# Twitter Sentiment Analysis 

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/twitter-data/master/twitt30k.csv')
df

Unnamed: 0,twitts,sentiment
0,@robbiebronniman Sounds like a great night.,1
1,Damn the person who stolde my wallet !!!!! Ma...,1
2,Greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars Pretty pretty pretty please, pak...",0
...,...,...
29995,@Calumfan1 is it in any way related to photosh...,0
29996,@Swiz_NZ really? wow thats crap,0
29997,"At the 2010 lexus HS250h press event. Again, ...",0
29998,@karmicunderpath ooooh now there's a nice thou...,1


In [4]:
df['sentiment'].value_counts()

1    15000
0    15000
Name: sentiment, dtype: int64

In [5]:
df.isnull().sum()

twitts       0
sentiment    0
dtype: int64

Average tweet length

In [6]:
df2 = df['twitts'].apply(lambda x : len(x))
df2

0         44
1         92
2         64
3        135
4        131
        ... 
29995     50
29996     32
29997     94
29998     50
29999    109
Name: twitts, Length: 30000, dtype: int64

In [7]:
df2.sum(axis=0)/len(df2)

73.72103333333334

## SVM Model and Data Preparation 

In [8]:
def run_svm(df):
    X = df['twitts']
    y = df['sentiment']

    tfidf = TfidfVectorizer()
    X = tfidf.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify = y)


    clf = LinearSVC()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    print()
    print('Printing Report')
    print(classification_report(y_test, y_pred))
    
    return tfidf, clf

In [9]:
%%time
tfidf, clf = run_svm(df)


Printing Report
              precision    recall  f1-score   support

           0       0.75      0.74      0.75      3000
           1       0.74      0.75      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000

Wall time: 546 ms


User Input Prediction

In [10]:
x = ['i am really happy. thanks a lot for coming with me']

In [11]:
clf.predict(tfidf.transform(x))

array([1], dtype=int64)

## Same model after cleaning

In [12]:
# Use my preprocess package whose real owner is instructor : https://github.com/laxmimerit

In [13]:
import preprocess_smz as pp

In [14]:
pp.__version__

'0.0.3'

In [15]:
df['twitts'] = df['twitts'].apply(lambda x: x.lower())

In [16]:
# function handling abbrevations

In [17]:
df['twitts'] = df['twitts'].apply(lambda x: pp.cont_exp(x))

In [18]:
# keep cleaning by removing and https
# pandas series apply function

df['twitts'] = df['twitts'].apply(lambda x: pp.remove_emails(x))
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_urls(x))
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_rt(x))
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_html_tags(x))
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_special_chars(x))

In [19]:
df

Unnamed: 0,twitts,sentiment
0,robbiebronniman sounds like a great night,1
1,damn the person who stolde my wallet may karma...,1
2,greetings from the piano bench photo,1
3,drewryanscott i love it i love you haha forget...,1
4,kissthestars pretty pretty pretty please pakid...,0
...,...,...
29995,calumfan1 is it in any way related to photoshop,0
29996,swiz_nz really wow thats crap,0
29997,at the 2010 lexus hs250h press event again can...,0
29998,karmicunderpath ooooh now there is a nice thought,1


In [20]:
run_svm(df)


Printing Report
              precision    recall  f1-score   support

           0       0.75      0.74      0.74      3000
           1       0.74      0.75      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000



(TfidfVectorizer(), LinearSVC())

In [21]:
x

['i am really happy. thanks a lot for coming with me']

In [22]:
clf.predict(tfidf.transform(x))

array([1], dtype=int64)

## Fine Tuning Model

In [23]:
df

Unnamed: 0,twitts,sentiment
0,robbiebronniman sounds like a great night,1
1,damn the person who stolde my wallet may karma...,1
2,greetings from the piano bench photo,1
3,drewryanscott i love it i love you haha forget...,1
4,kissthestars pretty pretty pretty please pakid...,0
...,...,...
29995,calumfan1 is it in any way related to photoshop,0
29996,swiz_nz really wow thats crap,0
29997,at the 2010 lexus hs250h press event again can...,0
29998,karmicunderpath ooooh now there is a nice thought,1


In [38]:
# Changes in TFidfVectorizer

def run_svm(df):
    X = df['twitts']
    y = df['sentiment']

    tfidf = TfidfVectorizer(norm = 'l1', ngram_range=(1,2), analyzer='word', max_features=5000)
    X = tfidf.fit_transform(X)
    

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify = y)

    print('shape of X: ', X.shape)

    clf = LinearSVC()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    print()
    print('Printing Report')
    print(classification_report(y_test, y_pred))
    
    return tfidf, clf

run_svm(df)

shape of X:  (30000, 5000)

Printing Report
              precision    recall  f1-score   support

           0       0.75      0.77      0.76      3000
           1       0.77      0.75      0.76      3000

    accuracy                           0.76      6000
   macro avg       0.76      0.76      0.76      6000
weighted avg       0.76      0.76      0.76      6000



(TfidfVectorizer(max_features=5000, ngram_range=(1, 2), norm='l1'),
 LinearSVC())

## Saving and Loading ML Model 

In [25]:
import pickle

In [26]:
pickle.dump(clf, open('classifier.pkl', 'wb'))
pickle.dump(tfidf, open('tfidf.pkl', 'wb'))

In [27]:
del clf
del tfidf

In [28]:
clf = pickle.load(open('classifier.pkl', 'rb'))
tfidf = pickle.load(open('tfidf.pkl', 'rb'))

In [29]:
clf

LinearSVC()

In [30]:
# tfidf.vocabulary_

In [31]:
x

['i am really happy. thanks a lot for coming with me']

In [32]:

clf.predict(tfidf.transform(x))

array([1], dtype=int64)

# Real-Time Twitter Sentiment Analysis 

In [33]:
consumer_key = 'R7DGimRNkT11sbngA0MRqLmNE'
consumer_secret = 'w5Axtw43feejwgmPIhqPhPOt1aHso1Guw1yuFwlmijtlh0vguK'
access_token = '1279486577656295425-l3gaKqKuHQdKl44rPXUc0WYcc26wgq'
access_token_secret = '80dGAdcx6LuoWM1mSt669V5NESP0EOuX1dK8Mianjqxi2'

In [34]:
# !pip install tweepy

In [35]:
import tweepy

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

public_tweets = api.home_timeline()


ModuleNotFoundError: No module named 'tweepy'

In [None]:
type(public_tweets)

In [None]:
public_tweets[0].text

In [None]:
for tweet in public_tweets:
    print(tweet.text)

## Tracking Keywords on Twitter 

In [27]:
import json
import pickle
import tweepy
import csv

In [5]:
from textblob import TextBlob

In [6]:
import preprocess_kgptalkie as pp

In [7]:
clf = pickle.load(open('clf.pkl', 'rb'))
tfidf = pickle.load(open('tfidf.pkl', 'rb'))