In [93]:
##Load the dataset

import pandas as pd

# Read the labelled tweets from the CSV file into a DataFrame.
DATASET_PATH='final_dataset_basicmlmodel.csv'
dataset=pd.read_csv(DATASET_PATH)

# Preview the first rows.
dataset.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [94]:
##Data Cleaning
##Remove special characters (#, ., /, <, > etc.), non-ASCII noise such as â and ð, digits and percent signs; apostrophes are kept

import re #regular expression library

def dataclean(text):
    """Normalise a raw tweet string for feature extraction.

    Steps, in order:
      1. Runs of non-ASCII characters are deleted outright.
      2. Every remaining character that is neither an English letter nor an
         apostrophe is replaced by a space, one space per character (runs
         are not collapsed), so digits and punctuation turn into spaces.
      3. The result is lower-cased.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned, lower-case text.
    """
    ascii_only=re.sub(r'[^\x00-\x7F]+','',text)
    letters_and_apostrophes=re.sub(r'[^a-zA-Z\']',' ',ascii_only)
    return letters_and_apostrophes.lower()


# Store a cleaned copy of every tweet in a new column (dataclean runs once per row).
dataset['clean_text']=dataset['tweet'].apply(dataclean)
dataset.head()

Unnamed: 0,id,label,tweet,clean_text
0,1,0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i can't us...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation,factsguide society now motivation


In [95]:
###Feature engineering: creating numerical features to understand underlying text

##Define the helper functions used for feature extraction: word frequencies, and flags for negation words, prompt (question) words
##and rare words in tweets


from wordcloud import STOPWORDS

##Generate word frequency
def generate_freq(text):
    """Count how often each non-stopword token occurs in `text`.

    Tokens are produced by whitespace splitting. Three input forms are
    accepted:

    * a pandas ``.str`` accessor (e.g. ``df.col.str``) -- the form the
      original implementation required;
    * a pandas Series of strings (its ``.str`` accessor is used);
    * a single plain string.

    Previously a plain string was silently counted per *character*, a bare
    Series raised AttributeError, and a missing (NaN/None) row raised
    TypeError. Missing rows are now skipped.

    Parameters
    ----------
    text : str, pandas.Series, or pandas ``.str`` accessor
        The text (or collection of texts) to tokenise.

    Returns
    -------
    pandas.Series
        Token counts as returned by ``value_counts()`` (indexed by token,
        most frequent first), with every label found in ``STOPWORDS``
        removed.
    """
    # A bare Series has no .split(); go through its string accessor instead.
    if isinstance(text,pd.Series):
        text=text.str

    tokenized=text.split()
    if isinstance(tokenized,list):
        # Plain string: split() already produced the flat list of words.
        word_list=tokenized
    else:
        # Vectorised split: one list of words per row. Rows with missing
        # text are not lists (NaN/None) and are skipped.
        word_list=[word for row in tokenized if isinstance(row,list) for word in row]

    # Create the word frequency table from the flat word list
    word_freq=pd.Series(word_list).value_counts()

    # Drop the stopwords; errors='ignore' because most of them will not occur
    return word_freq.drop(STOPWORDS,errors='ignore')

##check whether a negation term is present in the text
def check_neg(words):
    """Return 1 if any token in `words` is a negation term, otherwise 0.

    A token counts as a negation when it is exactly one of 'n', 'no', 'not'
    or 'non', or when it contains a word character followed by "n't"
    (e.g. "can't", "didn't").

    Parameters
    ----------
    words : iterable of str
        The tokens of one text.

    Returns
    -------
    int
        1 when a negation token is present, 0 otherwise (including for an
        empty iterable).
    """
    negation_terms=('n','no','not','non')
    has_negation=any(
        token in negation_terms or re.search(r"\wn't",token)
        for token in words
    )
    return 1 if has_negation else 0

##check whether one of the 100 rare words are present in the text or not
def check_rare(words,rare_100):
    """Return 1 if any token in `words` is contained in `rare_100`, else 0.

    Membership is tested with the ``in`` operator, so `rare_100` may be any
    container. Note that for a pandas Series ``in`` tests the index labels,
    not the values.

    Parameters
    ----------
    words : iterable
        The tokens of one text.
    rare_100 : container
        The collection of rare words to look for.

    Returns
    -------
    int
        1 when at least one token is found, 0 otherwise (including for an
        empty iterable).
    """
    return int(any(token in rare_100 for token in words))

##check whether prompt words are present in it or not
def isquestion(words):
    """Return 1 if `words` contains a prompt (question) word, otherwise 0.

    The prompt words are 'when', 'what', 'how', 'why' and 'who'; tokens must
    match one of them exactly.

    Parameters
    ----------
    words : iterable of str
        The tokens of one text.

    Returns
    -------
    int
        1 when a prompt word is present, 0 otherwise (including for an empty
        iterable).
    """
    prompt_words=('when', 'what', 'how', 'why', 'who')
    found_prompt=any(token in prompt_words for token in words)
    return int(found_prompt)

In [96]:
##Use the functions defined to extract features from the dataset and add columns to it

# Corpus-wide word frequencies. value_counts() orders by descending count, so
# the last 100 entries are the least frequent ("rare") words.
word_freq=generate_freq(dataset.clean_text.str)
rare_100=word_freq[-100:]

# Tokenise every cleaned tweet once and derive the per-tweet features from the tokens.
tweet_tokens=dataset.clean_text.str.split()
dataset['wordcount']=tweet_tokens.apply(len)                                     # number of words
dataset['isneg']=tweet_tokens.apply(check_neg)                                   # 1 if a negation word is present
dataset['israre']=tweet_tokens.apply(lambda tokens: check_rare(tokens,rare_100)) # 1 if a rare word is present
dataset['isprompt']=tweet_tokens.apply(isquestion)                               # 1 if a question word is present
dataset['charactercount']=dataset.clean_text.apply(len)                          # length of the cleaned text
dataset.head()

Unnamed: 0,id,label,tweet,clean_text,wordcount,isneg,israre,isprompt,charactercount
0,1,0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is s...,18,0,1,1,102
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i can't us...,19,1,0,0,122
2,3,0,bihday your majesty,bihday your majesty,3,0,0,0,21
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ...,12,0,0,0,54
4,5,0,factsguide: society now #motivation,factsguide society now motivation,4,0,0,0,39


In [105]:
#### Splitting the dataset into train and test sets

from sklearn.model_selection import train_test_split

# Model inputs are the five engineered features; the target is the tweet label.
feature_columns=['wordcount','isneg','israre','isprompt','charactercount']
X=dataset[feature_columns]
y=dataset['label']

# Hold out 10% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.1,random_state=27)

print(X_train.head(),y_train.head())

      wordcount  isneg  israre  isprompt  charactercount
2711          9      0       0         0             100
1374          9      0       0         0              53
2354         13      0       0         0              75
4             4      0       0         0              39
5121         20      0       0         0             108 2711    0
1374    0
2354    0
4       0
5121    1
Name: label, dtype: int64


In [107]:
#### Train a ML model for text classification
### We will use the Naive Bayes classification algorithm

from sklearn.naive_bayes import GaussianNB

# Create the GaussianNB classifier and fit it on the training split
# (fit() returns the fitted estimator itself, so the calls can be chained).
model=GaussianNB().fit(X_train,y_train)

# Predict labels for the held-out test split
pred=model.predict(X_test)

In [110]:
####Evaluate the model, check the accuracy

from sklearn.metrics import accuracy_score

# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the true
# labels first. Accuracy is symmetric, so the printed value does not change,
# but the correct order matters as soon as an asymmetric metric
# (precision, recall, confusion matrix, ...) is used in its place.
accuracy=accuracy_score(y_test,pred)
print("Accuracy:- ",accuracy*100,"%")

Accuracy:-  59.8095238095 %
