In [1]:
import numpy as np
import pandas as pd 

In [23]:
# Load the Datasets
train = pd.read_csv("../../Large_data/nlp-getting-started/train.csv")
test = pd.read_csv("../../Large_data/nlp-getting-started/test.csv")
submission = pd.read_csv("../../Large_data//nlp-getting-started/sample_submission.csv")

In [17]:
# First 5 row for train dataset
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [18]:
# First 5 row for test dataset
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [19]:
print(f"Train dataset shape {train.shape}")
print(f"Test dataset shape {test.shape}")

Train dataset shape (7613, 5)
Test dataset shape (3263, 4)


In [20]:
print(f"Null value for train dataset: {train.isnull().mean()}")
print("--------------------")
print(f"Null value for train dataset: {test.isnull().mean()}")

Null value for train dataset: id          0.000000
keyword     0.008013
location    0.332720
text        0.000000
target      0.000000
dtype: float64
--------------------
Null value for train dataset: id          0.000000
keyword     0.007968
location    0.338645
text        0.000000
dtype: float64


In [25]:
#Removing Keyword and location columns
train = train.drop(labels=['keyword','location'], axis=1)
test = test.drop(labels=['keyword','location'], axis=1)

In [26]:
train.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


# Text processing

In [27]:
def clean_data(name):
    # Replace email addresses with 'email'
    processed = name.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                     'emailaddress')

    # Replace URLs with 'webaddress'
    processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                      'webaddress')

    # Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
    processed = processed.str.replace(r'£|\$', 'moneysymb')

    # Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
    processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                      'phonenumbr')

    # Replace numbers with 'numbr'
    processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

    # Remove punctuation
    processed = processed.str.replace(r'[^\w\d\s]', ' ')

    # Replace whitespace between terms with a single space
    processed = processed.str.replace(r'\s+', ' ')

    # Remove leading and trailing whitespace
    processed = processed.str.replace(r'^\s+|\s+?$', '')

    # change words to lower case - Hello, HELLO, hello are all the same word
    processed = processed.str.lower()
    
    return processed

In [28]:
clean_train = clean_data(train["text"])
clean_test = clean_data(test["text"])

# The process of converting data to something a computer can understand is referred to as pre-processing. One of the major forms of pre-processing is to filter out useless data. In natural language processing, useless words (data), are referred to as stop words.

In [29]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

clean_train = clean_train.apply(lambda x:" ".join(term for term in x.split() if term not in stop_words))

clean_test = clean_test.apply(lambda x:" ".join(term for term in x.split() if term not in stop_words))

In [30]:
clean_train

0            deeds reason earthquake may allah forgive us
1                   forest fire near la ronge sask canada
2       residents asked shelter place notified officer...
3       numbr numbr people receive wildfires evacuatio...
4       got sent photo ruby alaska smoke wildfires pou...
                              ...                        
7608    two giant cranes holding bridge collapse nearb...
7609    aria_ahrary thetawniest control wild fires cal...
7610    mnumbr numbr numbr utc numbrkm volcano hawaii ...
7611    police investigating e bike collided car littl...
7612    latest homes razed northern california wildfir...
Name: text, Length: 7613, dtype: object

Stemming is the process of producing morphological variants of a root/base word. Stemming programs are commonly referred to as stemming algorithms or stemmers. A stemming algorithm reduces the words “chocolates”, “chocolatey”, “choco” to the root word, “chocolate” and “retrieval”, “retrieved”, “retrieves” reduce to the stem “retrieve”.

In [31]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()

clean_train = clean_train.apply(lambda x:" ".join([ps.stem(word) for word in x.split()]))

clean_test = clean_test.apply(lambda x:" ".join([ps.stem(word) for word in x.split()]))

In [32]:
clean_train

0               deed reason earthquak may allah forgiv us
1                    forest fire near la rong sask canada
2       resid ask shelter place notifi offic evacu she...
3       numbr numbr peopl receiv wildfir evacu order c...
4       got sent photo rubi alaska smoke wildfir pour ...
                              ...                        
7608    two giant crane hold bridg collaps nearbi home...
7609    aria_ahrari thetawniest control wild fire cali...
7610    mnumbr numbr numbr utc numbrkm volcano hawaii ...
7611    polic investig e bike collid car littl portug ...
7612    latest home raze northern california wildfir a...
Name: text, Length: 7613, dtype: object

 Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as a single item. Lemmatization is similar to stemming but it brings context to the words. So it links words with similar meaning to one word. Text preprocessing includes both Stemming as well as Lemmatization.

# Applications of lemmatization are:

*      Used in comprehensive retrieval systems like search engines.
*      Used in compact indexing


In [33]:
from nltk.stem import WordNetLemmatizer

wl = WordNetLemmatizer()

clean_train = clean_train.apply(lambda x:" ".join([wl.lemmatize(word) for word in x.split()]))

clean_test = clean_test.apply(lambda x:" ".join([wl.lemmatize(word) for word in x.split()]))

In [34]:
clean_test

0                                happen terribl car crash
1           heard earthquak differ citi stay safe everyon
2       forest fire spot pond gee flee across street c...
3                          apocalyps light spokan wildfir
4                typhoon soudelor kill numbr china taiwan
                              ...                        
3258      earthquak safeti lo angel ûò safeti fasten xrwn
3259    storm ri wors last hurrican citi amp numbroth ...
3260         green line derail chicago http co utbxlcbiuy
3261    meg issu hazard weather outlook hwo http co nu...
3262      cityofcalgari activ municip emerg plan yycstorm
Name: text, Length: 3263, dtype: object

In [35]:
train["text"] = clean_train
test["text"] = clean_test

In [36]:
# Spliting train and test set

from sklearn.model_selection import train_test_split

seed = 42

X = train.text
y = train.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

In [37]:
# some important libraries

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [38]:
# accuracy score function

def acc_summary(pipeline, X_train, y_train, X_test, y_test):
    sentiment_fit = pipeline.fit(X_train, y_train)
    y_pred = sentiment_fit.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
   
    print("-"*30)
    
    print("accuracy score: {0:.2f}%".format(accuracy*100))
    
    print("-"*30)
    
    return accuracy

In [39]:
# some model and their performance

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "Bernouli", "PassiveAggressiveClassifier",
     "Naive Bayes", "SVC"]

classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(n_estimators=100),
    LogisticRegression(),
    MultinomialNB(),
    BernoulliNB(),
    PassiveAggressiveClassifier(max_iter=50),
    SVC(kernel="linear")
]
    
zipped_clf = zip(names, classifiers)
tvec = TfidfVectorizer()
    
def compare_clf(classifier=zipped_clf, vectorizer=tvec, n_features=10000, ngram_range=(1, 1)):
    result = []
    vectorizer.set_params(stop_words=stop_words, max_features=n_features, ngram_range=ngram_range)
    for n, c in classifier:
        checker_pipeline = Pipeline([
            ("vectorizer", vectorizer),
            ("classifier", c)
        ])
        clf_acc = acc_summary(checker_pipeline, X_train, y_train, X_test, y_test)
        print("Model result for {}".format(n))
        print(c)
        result.append((n, clf_acc))
    return result

In [40]:
trigram_result = compare_clf()

------------------------------
accuracy score: 70.32%
------------------------------
Model result for K Nearest Neighbors
KNeighborsClassifier(n_neighbors=3)
------------------------------
accuracy score: 72.94%
------------------------------
Model result for Decision Tree
DecisionTreeClassifier(random_state=0)
------------------------------
accuracy score: 78.94%
------------------------------
Model result for Random Forest
RandomForestClassifier()
------------------------------
accuracy score: 80.69%
------------------------------
Model result for Logistic Regression
LogisticRegression()
------------------------------
accuracy score: 80.65%
------------------------------
Model result for Bernouli
MultinomialNB()
------------------------------
accuracy score: 80.91%
------------------------------
Model result for PassiveAggressiveClassifier
BernoulliNB()
------------------------------
accuracy score: 73.34%
------------------------------
Model result for Naive Bayes
PassiveAggressiveC

In [41]:
trigram_result

[('K Nearest Neighbors', 0.7031523642732049),
 ('Decision Tree', 0.7294220665499125),
 ('Random Forest', 0.7894045534150613),
 ('Logistic Regression', 0.8069176882661997),
 ('Bernouli', 0.8064798598949212),
 ('PassiveAggressiveClassifier', 0.809106830122592),
 ('Naive Bayes', 0.7333625218914186),
 ('SVC', 0.797723292469352)]

# Implement the best model

In [53]:
# Use TfidfVectorizer
# use of pipeline
vectorizer=TfidfVectorizer()
pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', LogisticRegression())
        ])
vectorizer.set_params(stop_words=None, max_features=100000, ngram_range=(1,4))
sentiment_fit = pipeline.fit(X_train,y_train)
y_pred = sentiment_fit.predict(X_test)

In [55]:
y_pred

array([0, 0, 1, ..., 1, 1, 0])

In [56]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
cm

In [58]:
accuracy_score(y_test, y_pred)

0.7802101576182137

In [59]:
import pickle
pickle.dump(sentiment_fit, open('RealOrfake.pkl','wb'))

In [71]:
model = pickle.load(open('RealOrfake.pkl','rb'))
series = pd.Series("This is  bitches, flood and earthquake")
data = clean_data(series)
value = model.predict(data)

In [72]:
if value == 1:
    print("Real tweet")
else:
    print("fake tweet")

Real tweet


In [66]:
def clean_data(name):
    processed = name.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddress')
    processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddress')
    processed = processed.str.replace(r'£|\$', 'moneysymb')
    processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phonenumbr')
    processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')
    processed = processed.str.replace(r'[^\w\d\s]', ' ')
    processed = processed.str.replace(r'\s+', ' ')
    processed = processed.str.replace(r'^\s+|\s+?$', '')
    processed = processed.str.lower()
    return processed