In [1]:
# import necessary libraries
import pandas as pd
import re
import string,time
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer

# define utility funtions

def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

def remove_punc(text):
    exclude = string.punctuation
    for char in exclude:
        text = text.replace(char,'')
    return text

def remove_punc1(text):
    exclude = string.punctuation
    return text.translate(str.maketrans('', '', exclude))


def chat_conversion(text):
    chat_words = {
        'AFAIK':'as far as i know',
        'AFK':'away from keyboard',
        'ASAP':'as soon as possible'
    }
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

def remove_stopwords(text):
    new_text = []

    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_stop_words(text):
    sentence = word_tokenize(text)
    words = [ word for word in sentence if not word in stopwords.word('english')]
    return ' '.join(words)

def remove_stop_words(text):
    sentence = word_tokenize(text)
    words = [ word for word in sentence if not word in stopwords.words('english')]
    return ' '.join(words)

def lemmatize_words(text):
    lem = WordNetLemmatizer()
    sentence = word_tokenize(text)
    words = [ lem.lemmatize(word) for word in sentence]
    return ' '.join(words)
    
def stem_words(text):
    stemmer = PorterStemmer()
    sentence = word_tokenize(text)
    words = [ stemmer.stem(word) for word in sentence]
    return ' '.join(words)

In [2]:
# load data set
df = pd.read_csv("data/Twitter Sentiments.csv");
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
df['tweet'][0]

' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run'

In [4]:
df.shape

(31962, 3)

# Steps for data preprocessing

### 1.lower case
### 2.remove_html_tags
### 3.remove_url
### 4.punctuation handling
### 5.chat_conversion handle
### 6.incorrect_text handling
### 7.stopwords
### 8.remove_emoji handle
### 9.Tokenization
### 10.Stemming 
### 11.Lemmatizing

In [5]:
df_copy = df.copy()
from nltk.tokenize import sent_tokenize

In [6]:
# lowering
df_copy['tweet'] = df_copy['tweet'].str.lower()
df_copy.tweet[0]

' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run'

In [7]:
#remove html tags
df_copy['tweet'] = df_copy['tweet'].apply(remove_html_tags)

In [8]:
# remove URLs
df_copy['tweet'] = df_copy['tweet'].apply(remove_url)

In [9]:
# remove punctuation
df_copy['tweet'] = df_copy['tweet'].apply(remove_punc1)

In [10]:
df_copy.tweet[3]

'model   i love u take with u all the time in urð\x9f\x93± ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  '

In [11]:
# chat conversion
df_copy.tweet[4018]

'user hey when are u guys gonna release the gtx 1080 strix in australia i needa buy it asap cant wait no longer hahahahaaa  '

In [12]:
df_copy['tweet'] = df_copy['tweet'].apply(chat_conversion)
df_copy.tweet[4018]

'user hey when are u guys gonna release the gtx 1080 strix in australia i needa buy it as soon as possible cant wait no longer hahahahaaa'

In [13]:
#incorrect text handling
from textblob import TextBlob


In [14]:
#df_copy['tweet'] = df_copy['tweet'].apply(lambda text: TextBlob(text).correct().string)

In [15]:
# remove stopwords
df_copy['tweet'] = df_copy['tweet'].apply(remove_stop_words)
df_copy.tweet[4018]

'user hey u guys gon na release gtx 1080 strix australia needa buy soon possible cant wait longer hahahahaaa'

In [16]:
# remove emojis
df_copy['tweet'] = df_copy['tweet'].apply(remove_emoji)
df_copy.tweet[3]

'model love u take u time urð\x9f\x93± ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91 ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦'

In [17]:
#remove special characters
reg = r'[^a-zA-Z]'
df_copy['tweet'] = df_copy['tweet'].apply(lambda text : re.sub(reg,' ',text))
df_copy.tweet[3]

'model love u take u time ur                                 '

In [18]:
#lemmatizing the words
df_copy['tweet'] = df_copy['tweet'].apply(lemmatize_words)

In [19]:
df_copy['tweet']

0        user father dysfunctional selfish drag kid dys...
1        user user thanks lyft credit cant use cause do...
2                                           bihday majesty
3                              model love u take u time ur
4                            factsguide society motivation
                               ...                        
31957                                   ate user isz youuu
31958    see nina turner airwave trying wrap mantle gen...
31959       listening sad song monday morning otw work sad
31960    user sikh temple vandalised calgary wso condem...
31961                                    thank user follow
Name: tweet, Length: 31962, dtype: object

In [20]:
df_copy.label.value_counts()


0    29720
1     2242
Name: label, dtype: int64

### data is highly imbalanced

In [21]:
print(df_copy.shape)
df_copy2 = df_copy.copy()
import numpy as np
# ls = np.where(df_copy['label'] == 0)
# len(ls[0])
indices = df_copy.index[df_copy['label'] == 0].tolist()

indices[0:3000]
# df_copy.label.value_counts()  
df_new = df_copy.copy()
df_new = df_new.drop(index= indices[3000:])


(31962, 3)


In [22]:
df_new.label.value_counts()

0    3000
1    2242
Name: label, dtype: int64

In [23]:
df_copy = df_new


# Bag Of Words

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [25]:
bow = cv.fit_transform(df_copy['tweet'])

In [26]:
len(cv.vocabulary_)

11119

In [27]:
bow.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [28]:
print(bow[0].toarray())

[[0 0 0 ... 0 0 0]]


In [29]:
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
X_train_bow ,X_test_bow,Y_train,Y_test = train_test_split(bow.toarray(),df_copy['label'],test_size=0.20,random_state=0)

In [30]:
print(X_train_bow.shape,X_test_bow.shape,Y_train.shape,Y_test.shape)

(4193, 11119) (1049, 11119) (4193,) (1049,)


## Apply ML

In [31]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_bow,Y_train)
y_pred = gnb.predict(X_test_bow)
accuracy_score(y_pred,Y_test)

0.7569113441372736

In [32]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train_bow,Y_train)
y_pred = rfc.predict(X_test_bow)
accuracy_score(y_pred,Y_test)

0.8217349857006673

# TFIDF

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
tf = TfidfVectorizer()
tfidfArray = tf.fit_transform(df_copy['tweet']).toarray()
tfidfArray

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [35]:
tf.idf_

array([8.87150195, 8.87150195, 8.87150195, ..., 8.87150195, 8.87150195,
       8.87150195])

In [36]:
X_train ,X_test,Y_train,Y_test = train_test_split(tfidfArray,df_copy['label'],test_size=0.20,random_state=0)
rfc = RandomForestClassifier()
rfc.fit(X_train,Y_train)
y_pred = rfc.predict(X_test)
accuracy_score(y_pred,Y_test)

0.8245948522402288

## Word to vec

In [37]:
tokenized_tweet = df_copy2['tweet'].apply(lambda x: x.split()) # tokenizing 

from gensim.models import Word2Vec
model_w2v = Word2Vec(
    window=10,
    min_count=2,
)
model_w2v.build_vocab(tokenized_tweet)
model_w2v.train(tokenized_tweet, total_examples=model_w2v.corpus_count, epochs=model_w2v.epochs)

(1098246, 1342810)

In [38]:
model_w2v.wv.most_similar(positive="dinner")

[('la', 0.9956013560295105),
 ('reminder', 0.9955602884292603),
 ('loveyou', 0.9951502680778503),
 ('design', 0.9947088360786438),
 ('humpday', 0.9946856498718262),
 ('bunny', 0.9946749806404114),
 ('beer', 0.9945499300956726),
 ('paradise', 0.9945416450500488),
 ('paris', 0.9944410920143127),
 ('rainbow', 0.994280993938446)]

In [39]:
y = model_w2v.wv.index_to_key

In [40]:
#y
print(tokenized_tweet[0])
model_w2v.wv['user']

['user', 'father', 'dysfunctional', 'selfish', 'drag', 'kid', 'dysfunction', 'run']


array([ 0.36760587,  0.06352774,  0.26577964, -0.07383872, -0.17611517,
       -0.80968654,  0.6607155 ,  1.9512951 , -0.7366356 , -0.4459084 ,
       -0.05949062, -0.79431295,  0.03478067,  0.54327583, -0.1461035 ,
       -0.57407945,  0.20170864, -0.81982255, -0.38568732, -1.7894647 ,
        1.1517943 ,  0.89570993,  0.6322998 , -0.2586831 ,  0.06424015,
       -0.5588366 , -0.13685638, -0.15524995, -0.8803973 , -0.17639638,
        0.7059247 , -0.3276034 ,  0.548378  , -0.94819367, -0.28550637,
        0.11850192,  0.6504269 , -0.9252527 , -0.08110474, -1.2786338 ,
       -0.14183562, -0.38168025, -1.3521514 , -0.24450126,  0.53713745,
       -0.37213984, -0.99861693,  0.3508669 ,  0.77051044,  0.9265397 ,
        0.5096165 , -0.28675443,  0.17559634, -0.6821249 ,  0.29306605,
        0.5095224 , -0.00820093, -0.10140014, -0.65651923,  0.12613136,
       -0.00484333, -0.0870612 , -0.13071802, -0.36670482, -0.8388478 ,
        1.0297117 , -0.35016438,  0.18115984, -1.0347841 ,  0.87

## Lets balance the data set

## Convert to vector

In [41]:
vector=[]
from tqdm import tqdm
for sent in tqdm(tokenized_tweet):
    sent_vec=np.zeros(100)
    count =0
    for word in sent: 
        if word in list(model_w2v.wv.index_to_key):
            vec = model_w2v.wv[word]
            sent_vec += vec 
            count += 1
    if count != 0:
        sent_vec /= count #normalize
    vector.append(sent_vec)
print(len(vector))
print(len(vector[0])) 

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 31962/31962 [00:54<00:00, 583.35it/s]

31962
100





In [42]:
w2v_data=pd.DataFrame(vector)
w2v_data['label'] = df_copy2['label']

In [43]:
import seaborn as sns
from sklearn.utils import resample
major_class_0,major_class_1=w2v_data.label.value_counts()
df_major=w2v_data[w2v_data['label']==0]
df_minor=w2v_data[w2v_data['label']==1]
df_minor_upsampled = resample(df_minor, 
                                 replace=True,     # sample with replacement
                                 n_samples=major_class_0)
df_wv_upsampled = pd.concat([df_major, df_minor_upsampled])
print('shape',df_wv_upsampled.shape)
#sns.countplot(df_wv_upsampled.label)
df_wv_upsampled.label.value_counts()

shape (59440, 101)


0    29720
1    29720
Name: label, dtype: int64

In [44]:
Y = df_wv_upsampled['label']
X = df_wv_upsampled.loc[:,:99]
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.020313,0.462140,0.012661,-0.144501,0.148504,-0.648105,0.527134,0.943230,-0.232091,-0.365398,...,0.568116,-0.088100,0.387907,0.124144,0.577395,0.261408,0.064578,-0.520902,0.185323,0.210054
1,0.035847,0.031578,0.125603,0.060391,0.020389,-0.444933,0.309471,0.811384,-0.366124,-0.107799,...,0.359426,-0.040718,0.146234,0.210180,0.644221,0.083951,0.002492,-0.568227,0.260416,-0.026832
2,-0.567864,0.733062,0.271392,-0.237087,0.068107,-0.765012,0.137128,0.950810,-0.012714,-0.699011,...,0.661913,-0.082153,0.858035,0.054140,0.743801,0.794076,-0.052192,-0.526138,0.317563,-0.211564
3,-0.530216,0.432435,-0.454670,0.430926,-0.368931,-0.501042,1.304811,1.478216,-0.623380,0.271630,...,1.446574,-0.352496,0.792822,0.012822,1.252491,0.234668,0.298770,-1.314121,0.494426,1.025854
4,-0.351796,0.303203,0.047363,0.070058,0.006580,-0.557374,0.092473,0.350262,-0.175360,0.003295,...,0.284458,0.113890,0.228271,-0.092680,0.173053,0.133277,0.282012,-0.338521,0.207774,0.312106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20138,0.135971,0.166409,0.252821,0.016684,-0.207263,-0.703671,0.430273,1.462677,-0.495254,-0.259997,...,0.475945,0.180805,0.431280,0.303666,1.084358,0.472990,-0.147725,-0.626558,0.116376,-0.299784
19558,0.093969,0.115171,0.177611,-0.017065,0.035728,-0.618576,0.410690,1.053110,-0.402227,-0.023418,...,0.465240,0.093034,0.253926,0.254271,0.838799,0.216737,0.082745,-0.693722,0.240930,-0.065664
2608,-0.040647,0.148310,0.063807,0.020884,0.003450,-0.393204,0.249428,0.565020,-0.221947,0.064307,...,0.303689,0.068904,0.198815,0.089749,0.412443,0.118738,0.203688,-0.418939,0.170161,0.110949
8070,-0.058705,0.045459,0.098280,0.133139,0.407504,-0.755562,-0.008003,0.941622,-0.026578,-0.078224,...,0.463418,0.012618,0.145364,0.291273,0.647647,0.222792,0.686774,-0.700369,0.325758,0.105711


In [45]:
X_train ,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.20,random_state=42)
rfc = RandomForestClassifier()
rfc.fit(X_train,Y_train)
y_pred = rfc.predict(X_test)
accuracy_score(y_pred,Y_test)

0.992849932705249

In [46]:
from sklearn.neighbors import KNeighborsClassifier

In [47]:
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train,Y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_pred,Y_test)

0.8775235531628532