In [144]:
#Here all imports will be done
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
import pandas as pd
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [62]:
#Importing Dataset
pos_tweets=twitter_samples.strings('positive_tweets.json')
neg_tweets=twitter_samples.strings('negative_tweets.json')
print("Some Positive tweets :")
for i in range(5):
    print(i+1,"- ",pos_tweets[i])
print()
print("Some Negative tweets :")
for i in range(5):
    print(i+1,"- ",neg_tweets[i])
print()
print("Total Positive tweets =",len(pos_tweets))
print("Total Negative tweets =",len(neg_tweets))

Some Positive tweets :
1 -  #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
2 -  @Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!
3 -  @DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!
4 -  @97sides CONGRATS :)
5 -  yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days

Some Negative tweets :
1 -  hopeless for tmr :(
2 -  Everything in the kids section of IKEA is so cute. Shame I'm nearly 19 in 2 months :(
3 -  @Hegelbon That heart sliding into the waste basket. :(
4 -  “@ketchBurning: I hate Japanese call him "bani" :( :(”

Me too
5 -  Dang starting next week I have "work" :(

Total Positive tweets = 5000
Total Negative tweets = 5000


In [63]:
#Pre-Processing
def pre_process(tweets):
    # (http[s]{0,1}://(\S)*)|(www\.[\S]+\.[\w]{1,6}) -> for matching url
    r1=re.compile(r'(http[s]{0,1}://(\S)*)|(www\.[\S]+\.[\w]{1,6})',re.A)
    # (^rt(\s)+)|([\s]rt[\s]*) -> for matching retweet(rt) at start or in between ...
    r2=re.compile(r'(^rt(\s)+)|([\s]rt[\s]*)',re.M)
    # @(\S)+ -> for matching @username ...
    r3=re.compile(r'@(\S)+',re.A)
    # #(\S)+ ->for matching #hashtags
    r4=re.compile(r'#(\S)+',re.A)
    # [^\w\s]+ -> for matching non word characters like symbols,emoticons,punctuations
    r5=re.compile(r'[^\w\s]+',re.A)
    k=1
    stopword=stopwords.words('english') #imported a list of stopwords
    #print(stopword)
    processed_tweets=[]
    for tweet in tweets:
        tweet=tweet.lower() #converting tweet to lower case
        tweet=re.sub(r1," ",tweet)
        tweet=re.sub(r2," ",tweet)
        tweet=re.sub(r3," ",tweet)
        tweet=re.sub(r4," ",tweet)
        tweet=re.sub(r5," ",tweet)
        tweet=' '.join(tweet.split())
        #k+=1
        tokens=word_tokenize(tweet) #tokenization
        #removing stopwords
        filtered_words=[]
        for token in tokens:
            if token not in stopword:
                filtered_words.append(token)
        #stemming
        stemmed_words=[]
        ps=PorterStemmer()
        for word in filtered_words:
            stemmed_words.append(ps.stem(word))
        processed_tweet=' '.join(stemmed_words)
        processed_tweets.append(processed_tweet)
    return processed_tweets

In [64]:
processed_pos_tweets=pre_process(pos_tweets)
processed_neg_tweets=pre_process(neg_tweets)
print(processed_neg_tweets)

['hopeless tmr', 'everyth kid section ikea cute shame nearli 19 2 month', 'heart slide wast basket', 'hate japanes call bani', 'dang start next week work', 'oh god babi face', 'make smile', 'work neighbour motor ask said hate updat search', 'sialan', 'athabasca glacier', 'realli good amp g idea never go meet', 'mare ivan', 'happi trip keep safe see soon', 'tire hahahah', 'knee replac get amp day ouch', 'relat sweet n sour kind bi polar peopl life cuz life full', 'pleass', 'im sure tho', 'feel stupid seem grasp basic digit paint noth research help', 'good lord', 'feel lone someon talk guy girl', 'assign project realli', 'want play video game watch movi someon', 'choreograph hard', 'email link still say longer avail', 'cri bc miss mingm much', 'sorri', 'mom far away', 'truli sorri safe flight', 'friend', 'oh hate happen get sad', 'oh dog pee bag take', 'doushit', 'late', 'suck much sick plan start work first gundam night nope', '2 dollar', 'listen back old show know weird got u leav migh

In [5]:
# class1=[1 for i in range(5000)]
# pos_df=pd.DataFrame(list(zip(processed_pos_tweets,class1)),columns=['pro_tweet','t_class'])
# pos_df=shuffle(pos_df,random_state=0).reset_index(drop=True)

# class0=[0 for i in range(5000)]
# neg_df=pd.DataFrame(list(zip(processed_neg_tweets,class0)),columns=['pro_tweet','t_class'])
# neg_df=shuffle(neg_df,random_state=0).reset_index(drop=True)


In [83]:
def pos_score(row):
    if(row['t_polarity']>2.0 and row['t_polarity']<=4.0):
        return 2
    if(row['t_polarity']>0.0 and row['t_polarity']<=2.0):
        return 1
    if(row['t_polarity']==0.0):
        return 0
def neg_score(row):
    if(row['t_polarity']>=2.0 and row['t_polarity']<=4.0):
        return -2
    if(row['t_polarity']>0.0 and row['t_polarity']<2.0):
        return -1
    if(row['t_polarity']==0.0):
        return 0

In [84]:
#TF-IDF on positive tweets
vectorizer_pos= TfidfVectorizer(min_df=5,max_df=0.75,ngram_range=(1,3),max_features=10000)
vectors=vectorizer_pos.fit_transform(processed_pos_tweets)
feature_names = vectorizer_pos.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()    
pos_df= pd.DataFrame(denselist, columns=feature_names)

#Scoring
pos_df["t_polarity"]=pos_df.sum(axis=1)
pos_df['t_polarity'] =pos_df.apply(pos_score, axis='columns')
print(pos_df['t_polarity'].value_counts())
pos_df=pos_df.dropna()

1.0    2857
2.0    1703
0.0     280
Name: t_polarity, dtype: int64


In [85]:
pos_df

Unnamed: 0,000,10,100,11,12,13,15,16,17,20,...,yep,yesterday,yet,yo,youth,youth job,youth job opportun,youtub,yup,t_polarity
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.616927,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [86]:
#TF-IDF on negative tweets
vectorizer_neg= TfidfVectorizer(min_df=5,max_df=0.75,ngram_range=(1,3),max_features=10000)
vectors=vectorizer_neg.fit_transform(processed_neg_tweets)
feature_names = vectorizer_neg.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
neg_df= pd.DataFrame(denselist, columns=feature_names)

#Scoring
neg_df["t_polarity"]=neg_df.sum(axis=1)
neg_df['t_polarity'] =neg_df.apply(neg_score, axis='columns')
print(neg_df['t_polarity'].value_counts())
neg_df=neg_df.dropna()
neg_df

-1.0    2984
-2.0    1606
 0.0     365
Name: t_polarity, dtype: int64


Unnamed: 0,07,10,100,11,12,15,20,2015,2015 07,24,...,ye,yeah,year,yep,yesterday,yet,your,youtub,zayn,t_polarity
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0


In [135]:
#Finale dataframe
df= pos_df.append(neg_df, ignore_index=True)
print(df["t_polarity"].value_counts())
df.fillna(-99999,inplace=True)
df

-1.0    2984
 1.0    2857
 2.0    1703
-2.0    1606
 0.0     645
Name: t_polarity, dtype: int64


Unnamed: 0,000,10,100,11,12,13,15,16,17,20,...,wish could,woman,women,wors,worst,wtf,x15,xd,your,zayn
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.616927,0.0,0.0,0.0,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9790,-99999.0,0.0,0.0,0.0,0.0,-99999.0,0.000000,-99999.0,-99999.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9791,-99999.0,0.0,0.0,0.0,0.0,-99999.0,0.000000,-99999.0,-99999.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9792,-99999.0,0.0,0.0,0.0,0.0,-99999.0,0.000000,-99999.0,-99999.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9793,-99999.0,0.0,0.0,0.0,0.0,-99999.0,0.000000,-99999.0,-99999.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
y=df["t_polarity"]
X=df.drop(["t_polarity"], 1)

In [89]:
X

Unnamed: 0,000,10,100,11,12,13,15,16,17,20,...,wish could,woman,women,wors,worst,wtf,x15,xd,your,zayn
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.616927,0.0,0.0,0.0,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9790,-99999.0,0.0,0.0,0.0,0.0,-99999.0,0.000000,-99999.0,-99999.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9791,-99999.0,0.0,0.0,0.0,0.0,-99999.0,0.000000,-99999.0,-99999.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9792,-99999.0,0.0,0.0,0.0,0.0,-99999.0,0.000000,-99999.0,-99999.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9793,-99999.0,0.0,0.0,0.0,0.0,-99999.0,0.000000,-99999.0,-99999.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [90]:
#Partitioning into training and test set
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

A. SUPPORT VECTOR REGRESSION

In [116]:
from sklearn.svm import SVR
model1=SVR()
model1.fit(X_train,y_train)

SVR()

In [117]:
acc_svr=model1.score(X_test, y_test)

In [119]:
acc_svr

0.813374716331852

B. DECISION TREE

In [145]:
from sklearn.tree import DecisionTreeClassifier
model2=DecisionTreeClassifier(random_state=1)
model2.fit(X_train, y_train)
y_pred2_test=model2.predict(X_test)
acc_dtree=metrics.accuracy_score(y_test, y_pred2_test)
print(acc_dtree)

0.857434501531133


In [148]:
print(classification_report(y_test,y_pred2_test))

              precision    recall  f1-score   support

        -2.0       0.85      0.83      0.84       465
        -1.0       0.91      0.85      0.88       900
         0.0       0.60      1.00      0.75       202
         1.0       0.90      0.86      0.88       843
         2.0       0.89      0.84      0.86       529

    accuracy                           0.86      2939
   macro avg       0.83      0.88      0.84      2939
weighted avg       0.87      0.86      0.86      2939



C.RANDOM FOREST

In [154]:
from sklearn.ensemble import RandomForestClassifier
model3= RandomForestClassifier()
model3.fit(X_train,y_train)
y_pred3_test =model3.predict(X_test)
acc_rf=metrics.accuracy_score(y_test, y_pred3_test)
print(acc_rf)

0.8363388907791766


In [155]:
print(classification_report(y_test,y_pred3_test))

              precision    recall  f1-score   support

        -2.0       0.94      0.72      0.81       465
        -1.0       0.86      0.89      0.87       900
         0.0       0.55      1.00      0.71       202
         1.0       0.83      0.88      0.86       843
         2.0       0.96      0.72      0.82       529

    accuracy                           0.84      2939
   macro avg       0.83      0.84      0.82      2939
weighted avg       0.86      0.84      0.84      2939



D. MULTINOMIAL LOGISTIC REGRESSION 

In [156]:
from sklearn.linear_model import LogisticRegression
model4= LogisticRegression(multi_class='multinomial',solver='newton-cg')
model4.fit(X_train, y_train)
y_pred4_test =model4.predict(X_test)
acc_lr=metrics.accuracy_score(y_test, y_pred4_test)
print(acc_lr)



0.6628104797550187




In [157]:
print(classification_report(y_test,y_pred4_test))

              precision    recall  f1-score   support

        -2.0       0.80      0.29      0.43       465
        -1.0       0.66      0.96      0.79       900
         0.0       0.00      0.00      0.00       202
         1.0       0.63      0.93      0.75       843
         2.0       0.73      0.31      0.44       529

    accuracy                           0.66      2939
   macro avg       0.56      0.50      0.48      2939
weighted avg       0.64      0.66      0.60      2939



  _warn_prf(average, modifier, msg_start, len(result))
