<a href="https://colab.research.google.com/github/gupta24789/sentiment-analysis/blob/main/03_logistic_regression_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import ast
import itertools
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

## Read Data

In [2]:
train_df = pd.read_csv("https://raw.githubusercontent.com/gupta24789/sentiment-analysis/main/data/train.csv")
val_df = pd.read_csv("https://raw.githubusercontent.com/gupta24789/sentiment-analysis/main/data/val.csv")

# processed_tweet arrives from the CSV as text (presumably the repr of a token
# list) and has to be turned back into a Python list. ast.literal_eval only
# parses literals, so a malformed or malicious cell in the remote file cannot
# execute arbitrary code the way eval() could.
# Missing values are replaced by '[]' first, so they parse to an empty list.
train_df['processed_tweet'] = train_df['processed_tweet'].fillna('[]').apply(ast.literal_eval)
val_df['processed_tweet'] = val_df['processed_tweet'].fillna('[]').apply(ast.literal_eval)

In [3]:
# Class balance of the training split
train_df['label'].value_counts()

1.0    4000
0.0    4000
Name: label, dtype: int64

In [4]:
# Class balance of the validation split
val_df['label'].value_counts()

1    1000
0    1000
Name: label, dtype: int64

## Create Word Freq by label

In [5]:
# Count every token occurrence across the training tweets labelled 1 (positive)
positive_tweets = train_df.loc[train_df['label'] == 1, 'processed_tweet']
pos_freq_dict = Counter(token for tokens in positive_tweets for token in tokens)
pos_freq_dict.most_common(10)

[(':)', 2866),
 (':-)', 530),
 ('thank', 507),
 (':d', 504),
 ('love', 322),
 ('follow', 306),
 ('...', 221),
 ('day', 193),
 ('good', 191),
 ('like', 186)]

In [6]:
# Count every token occurrence across the training tweets labelled 0 (negative)
negative_tweets = train_df.loc[train_df['label'] == 0, 'processed_tweet']
neg_freq_dict = Counter(token for tokens in negative_tweets for token in tokens)
neg_freq_dict.most_common(10)

[(':(', 3636),
 (':-(', 404),
 ("i'm", 293),
 ('...', 268),
 ('miss', 242),
 ('pleas', 219),
 ('follow', 202),
 ('want', 192),
 ('like', 190),
 ('get', 189)]

## Create Features

- pos_freq : sum, over the unique words in the tweet, of each word's count in the positive training tweets
- neg_freq : sum, over the unique words in the tweet, of each word's count in the negative training tweets

In [7]:
# For each split, add one column per frequency table. Each value is the sum of
# the table counts of the tweet's unique tokens; tokens missing from the table
# contribute 0.
for frame in (train_df, val_df):
    for column, freq_table in (('pos_freq', pos_freq_dict), ('neg_freq', neg_freq_dict)):
        frame[column] = frame.processed_tweet.apply(
            lambda tokens: np.sum([freq_table.get(tok, 0) for tok in set(tokens)])
        )

In [8]:
# Constant feature column, set to 1 on both splits
for frame in (train_df, val_df):
    frame['bias'] = 1

In [9]:
# Preview the first rows together with the newly added feature columns
train_df.head(n=6)

Unnamed: 0,raw_tweet,processed_tweet,label,pos_freq,neg_freq,bias
0,Want to say a huge thanks to @WarriorAssaultS ...,"[want, say, huge, thank, ff, thank, support, :)]",1.0,3575.0,358.0,1
1,@jaynehh_ you just need a job and get a letter...,"[need, job, get, letter, work, place, say, wor...",1.0,958.0,464.0,1
2,"@knhillrocks HA yes, make it quick tho :D","[ha, ye, make, quick, tho, :d]",1.0,690.0,144.0,1
3,@shartyboy Thanks for texting me back :)) I'm ...,"[thank, text, back, :), i'm, text, tomorrow, :)]",1.0,3650.0,512.0,1
4,Laying out a greetings card range for print to...,"[lay, greet, card, rang, print, today, love, j...",1.0,990.0,240.0,1
5,#FollowFriday @CCIFCcanada @AdamEvnmnt @boxcal...,"[followfriday, top, engag, member, commun, wee...",1.0,3026.0,58.0,1


In [10]:
# Feature columns, in the order they are handed to the model
feature_cols = ['bias', 'pos_freq', 'neg_freq']

# Missing feature values and missing labels are both replaced by 0
train_x = train_df.loc[:, feature_cols].fillna(0)
train_y = train_df['label'].fillna(0).to_numpy()

val_x = val_df.loc[:, feature_cols].fillna(0)
val_y = val_df['label'].fillna(0).to_numpy()

In [11]:
## Logistic Regression - ML model
# Fitted with the library defaults on the three-column training frame.
# NOTE(review): LogisticRegression fits its own intercept by default, so the
# constant 'bias' column is likely redundant - confirm before removing it.
model = LogisticRegression()
model.fit(X=train_x, y=train_y)

In [12]:
# Two probability columns per row; predict 1 when the second column is
# strictly larger than the first, otherwise 0.
predicted_prob = model.predict_proba(train_x)
train_pred = (predicted_prob[:, 1] > predicted_prob[:, 0]).astype(int)

In [13]:
# Same decision rule as for the training split, applied to the validation features
predicted_prob = model.predict_proba(val_x)
val_pred = (predicted_prob[:, 1] > predicted_prob[:, 0]).astype(int)

In [14]:
# Accuracy = share of rows whose predicted label equals the true label
train_acc = (train_pred == train_y).sum() / len(train_pred)
val_acc = (val_pred == val_y).sum() / len(val_pred)
print(f"Train Accuracy : {train_acc:.2f}")
print(f"Val Accuracy : {val_acc:.2f}")

Train Accuracy : 0.99
Val Accuracy : 0.99


## Error Analysis

In [15]:
# Recompute the validation predictions and show how many fall in each class
predicted_prob = model.predict_proba(val_x)
pred = (predicted_prob[:, 1] > predicted_prob[:, 0]).astype(int)
pd.Series(pred).value_counts()

1    1016
0     984
dtype: int64

In [16]:
## Show a reproducible sample of validation tweets where pred goes wrong
# Attach the predictions as a column so each row carries its own predicted
# label. Indexing the `pred` array with the DataFrame index label is only
# correct while val_df keeps a default 0..n-1 index.
misclassified = val_df.assign(pred_label=pred)[pred != val_y]
# sample() raises when asked for more rows than exist, so cap the request.
n_examples = min(6, len(misclassified))
# Fixed random_state so Restart & Run All prints the same examples every time.
for _, row in misclassified.sample(n_examples, random_state=42).iterrows():
  print(f"Raw Tweet : {row['raw_tweet']}")
  print(f"Processed Tweet : {row['processed_tweet']}")
  print(f"True Label : {row['label']}")
  print(f"Pred Label : {row['pred_label']}")
  print("\n")

Raw Tweet : @bumkeyyfel they're not. : ( except for those two who kill people ene
Processed Tweet : ["they'r", 'except', 'two', 'kill', 'peopl', 'ene']
True Label : 0
Pred Label : 1


Raw Tweet : pats jay : (
Processed Tweet : ['pat', 'jay']
True Label : 0
Pred Label : 1


Raw Tweet : “@TheShoeBibles: Its mine.... http://t.co/I9rRNjyvUq” damnnn fineeeeeeeee omaigoshhhhhhh can I have you :((((
Processed Tweet : ['“', 'mine', '...']
True Label : 0
Pred Label : 1


Raw Tweet : Fuuuuuuuuuck me Jesus &gt;:(
Processed Tweet : ['fuuuck', 'jesu', '>:(']
True Label : 0
Pred Label : 1


Raw Tweet : I liked a @YouTube video http://t.co/93Z6WOVOs9 PASHA IS CRYING AFTER TERRIBLE DONATE :(
Processed Tweet : ['like', 'video']
True Label : 0
Pred Label : 1


Raw Tweet : @kiyomitsucashew it kInda does :-( and thank you!!! ill def do th
Processed Tweet : ['kinda', ':-(', 'thank', 'ill', 'def', 'th']
True Label : 0
Pred Label : 1




## Predict

In [17]:
import re
import string
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus; process_tweet reads it via stopwords.words('english')
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
def process_tweet(tweet):
    """Clean, tokenize and stem a raw tweet.

    Steps, in order: strip stock tickers, a leading "RT" marker, hyperlinks
    and '#' signs; tokenize with TweetTokenizer (lower-cased, handles
    stripped, elongated words shortened); drop English stopwords and
    punctuation tokens; Porter-stem what is left.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    NOTE(review): the tokens in train_df/val_df were produced outside this
    notebook; if that preprocessing still uses the old ticker/hyperlink
    patterns, apply the same fixes there so training and inference agree.
    """
    stemmer = PorterStemmer()
    # A set gives constant-time membership tests inside the token loop.
    stopwords_english = set(stopwords.words('english'))
    # remove stock market tickers like $GE
    # ("$" must be escaped: a bare "$" is the end-of-string anchor, so the
    # unescaped pattern never matched a ticker)
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    # (only the URL itself: a greedy ".*" after the scheme also deleted every
    # word and emoticon that followed the link on the same line)
    tweet = re.sub(r'https?://\S+', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # Keep tokens that are neither stopwords nor punctuation, then stem them.
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return tweets_clean

In [19]:
def predict(tweet):
  """Predict the sentiment label of a raw tweet with the fitted model.

  Input:
      tweet: a string containing a tweet
  Output:
      1 when the model's second probability column is strictly larger than
      its first, otherwise 0
  """
  # Sum over unique tokens only: the pos_freq/neg_freq training features are
  # built from set(tokens), so counting repeated words here would feed the
  # model values on a different scale than it was fitted on.
  unique_tokens = set(process_tweet(tweet))
  pos_freq = np.sum([pos_freq_dict.get(w, 0) for w in unique_tokens])
  neg_freq = np.sum([neg_freq_dict.get(w, 0) for w in unique_tokens])

  # The model was fitted on a DataFrame, so pass the same named columns in
  # the same order rather than a bare array.
  row = pd.DataFrame([[1, pos_freq, neg_freq]], columns=['bias', 'pos_freq', 'neg_freq'])
  prob = model.predict_proba(row)
  # Compare scalars instead of relying on the truth value of a 1-element array.
  pred = int(prob[0, 1] > prob[0, 0])
  return pred

In [20]:
tweet = "I hate this movies :("
print(predict(tweet))

0




In [21]:
tweet = "I love this movies"
print(predict(tweet))

1


