### Importing Libraries

In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer



from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


import re

print("Tensorflow Version",tf.__version__)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thappana\AppData\Roaming\nltk_data...


Tensorflow Version 2.9.1


[nltk_data]   Unzipping corpora\stopwords.zip.


### About this file

This is the sentiment140 dataset.
It contains 1,600,000 tweets extracted using the twitter api . 

The tweets have been annotated (0 = negative, 2 = neutral, 4 = positive) and they can be used to detect sentiment .

It contains the following 6 fields:

1. `target`: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)

2. `ids`: The id of the tweet ( 2087)

3. `date`: the date of the tweet (Sat May 16 23:58:44 UTC 2009)

4. `flag`: The query (lyx). If there is no query, then this value is NO_QUERY.

5. `user`: the user that tweeted (robotickilldozr)

6. `text`: the text of the tweet (Lyx is cool)

1. Read the .csv file and set it as a Dataframe called text_data. Check the head, info, and describe methods on the Dataframe(3 Marks)

In [2]:
file_path = "./training.1600000.processed.noemoticon.csv"
text_data = pd.read_csv(file_path, encoding = 'latin')
text_data.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
text_data.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [3]:
print("Shape of text data: ",text_data.shape)

Shape of text data:  (1599999, 6)


In [4]:
text_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599999 entries, 0 to 1599998
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1599999 non-null  int64 
 1   ids     1599999 non-null  int64 
 2   date    1599999 non-null  object
 3   flag    1599999 non-null  object
 4   user    1599999 non-null  object
 5   text    1599999 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [5]:
text_data.describe(include='all')

Unnamed: 0,target,ids,date,flag,user,text
count,1599999.0,1599999.0,1599999,1599999,1599999,1599999
unique,,,774362,1,659775,1581465
top,,,Mon Jun 15 12:53:14 PDT 2009,NO_QUERY,lost_dog,isPlayer Has Died! Sorry
freq,,,20,1599999,549,210
mean,2.000001,1998818000.0,,,,
std,2.000001,193575700.0,,,,
min,0.0,1467811000.0,,,,
25%,0.0,1956916000.0,,,,
50%,4.0,2002102000.0,,,,
75%,4.0,2177059000.0,,,,


In [6]:
# Dropping columns which are not required
text_data = text_data.drop(['ids', 'date', 'flag', 'user'], axis=1)
text_data.head()

Unnamed: 0,target,text
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


In [7]:
# There is a 50-50 ratio for positive and negative sentiments
text_data['target'].value_counts()

4    800000
0    799999
Name: target, dtype: int64

In [8]:
num_to_emot = {0:"Negative", 4:"Positive"}
def label_decoder(label):
  return num_to_emot[label]

text_data['target'] = text_data['target'].apply(lambda x: label_decoder(x))
text_data.head()

Unnamed: 0,target,text
0,Negative,is upset that he can't update his Facebook by ...
1,Negative,@Kenichan I dived many times for the ball. Man...
2,Negative,my whole body feels itchy and like its on fire
3,Negative,"@nationwideclass no, it's not behaving at all...."
4,Negative,@Kwesidei not the whole crew


2. Remove punctuations and stopwords from the text in “text” column (2Marks)

In [9]:
stop_words = stopwords.words('english')
stemmer = SnowballStemmer('english')

text_cleaning_re = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

In [10]:
def preprocess(text, stem=False):
  text = re.sub(text_cleaning_re, ' ', str(text).lower()).strip()
  tokens = []
  for token in text.split():
    if token not in stop_words:
      if stem:
        tokens.append(stemmer.stem(token))
      else:
        tokens.append(token)
  return " ".join(tokens)

In [11]:
%%timeit
# takes around 6 min to run
text_data['text'] = text_data['text'].apply(lambda x: preprocess(x))
text_data.head()

50.7 s ± 3.45 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
text_data.to_csv('./preprocessed_data.csv')

In [None]:
#text_data=pd.read_csv('./preprocessed_data.csv')

3. Create two objects X and y. X will be the 'text’ column of Dataframe and y will be the 'target' column. Create a CountVectorizer object and split the data into training and testing sets. Train a MultinomialNB model and Display the confusion Matrix (5 Marks)

In [13]:
X = text_data['text']
y = text_data['target']

In [14]:
X

0          upset update facebook texting might cry result...
1          dived many times ball managed save 50 rest go ...
2                           whole body feels itchy like fire
3                                           behaving mad see
4                                                 whole crew
                                 ...                        
1599994                        woke school best feeling ever
1599995             thewdb com cool hear old walt interviews
1599996                      ready mojo makeover ask details
1599997    happy 38th birthday boo alll time tupac amaru ...
1599998    happy charitytuesday thenspcc sparkscharity sp...
Name: text, Length: 1599999, dtype: object

In [15]:
y

0          Negative
1          Negative
2          Negative
3          Negative
4          Negative
             ...   
1599994    Positive
1599995    Positive
1599996    Positive
1599997    Positive
1599998    Positive
Name: target, Length: 1599999, dtype: object

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [17]:
print("Shape of X_train", X_train.shape)
print("Shape of y_train", y_train.shape)

print("Shape of X_test", X_test.shape)
print("Shape of y_test", y_test.shape)

Shape of X_train (1071999,)
Shape of y_train (1071999,)
Shape of X_test (528000,)
Shape of y_test (528000,)


In [18]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

text_clf = MultinomialNB()

text_clf

MultinomialNB()

In [19]:
import time

t1 = time.time()
vectorizer = CountVectorizer(ngram_range=(1, 3), analyzer = 'word')

X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

text_clf.fit(X_train_vect, y_train)
t2 = time.time()
print(f"Time taken to run: {(t2-t1)} seconds")

Time taken to run: 81.79334163665771 seconds


In [20]:
from sklearn.metrics import classification_report

preds = text_clf.predict(X_test_vect)
print(classification_report(y_test, preds, digits=2))

              precision    recall  f1-score   support

    Negative       0.76      0.80      0.78    263320
    Positive       0.79      0.75      0.77    264680

    accuracy                           0.77    528000
   macro avg       0.78      0.78      0.77    528000
weighted avg       0.78      0.77      0.77    528000



4. Display the HMM POS tagging on the first 4 rows of ‘text’ (2 Marks)



In [21]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\thappana\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\thappana\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [22]:
def read_data(df):
    df = df.reset_index(drop = True) 
    return df.sample(frac = 1)

text_data_sample = read_data(text_data)

def get_pos_tag(text):
    lower_case = text.lower()
    tokens = nltk.word_tokenize(lower_case)
    tags = nltk.pos_tag(tokens)
    print(tags)
    print()
    
text_data_sample = read_data(text_data)
text_data_sample.head(4)["text"].apply(get_pos_tag)

[('hate', 'NN'), ('sleeping', 'VBG'), ('alone', 'RB'), ('california', 'NN'), ('king', 'VBG'), ('sized', 'VBN'), ('bed', 'NN'), ('wish', 'NN')]

[('f1', 'NN'), ('fair', 'NN')]

[('today', 'NN'), ('going', 'VBG'), ('prue', 'JJ'), ('100', 'CD'), ('hard', 'JJ'), ('work', 'NN'), ('msn', 'NN'), ('fb', 'NN'), ('twitter', 'NN')]

[('geez', 'NN'), ('sucks', 'NNS')]



270196     None
730322     None
1470779    None
694452     None
Name: text, dtype: object

5. Parse the first 4 rows of ‘text’ using Viterbi Parser [Use toy_pcfg1 and toy_pcfg2 to get the probabilistic context free grammars; use the PCFG suitable for each sentence] (3 marks)

In [23]:
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys
    import time

    from nltk import tokenize
    from nltk.grammar import PCFG
    from nltk.parse import ViterbiParser

    toy_pcfg1 = PCFG.fromstring(
        """
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """
    )

    toy_pcfg2 = PCFG.fromstring(
        """
    S    -> NP VP         [1.0]
    VP   -> V NP          [.59]
    VP   -> V             [.40]
    VP   -> VP PP         [.01]
    NP   -> Det N         [.41]
    NP   -> Name          [.28]
    NP   -> NP PP         [.31]
    PP   -> P NP          [1.0]
    V    -> 'saw'         [.21]
    V    -> 'ate'         [.51]
    V    -> 'ran'         [.28]
    N    -> 'boy'         [.11]
    N    -> 'cookie'      [.12]
    N    -> 'table'       [.13]
    N    -> 'telescope'   [.14]
    N    -> 'hill'        [.5]
    Name -> 'Jack'        [.52]
    Name -> 'Bob'         [.48]
    P    -> 'with'        [.61]
    P    -> 'under'       [.39]
    Det  -> 'the'         [.41]
    Det  -> 'a'           [.31]
    Det  -> 'my'          [.28]
    """
    )

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ("I saw the man with my telescope", toy_pcfg1),
        ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print(f"{i + 1:>3}: {demos[i][0]}")
        print("     %r" % demos[i][1])
        print()
    print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except:
        print("Bad sentence number")
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}")
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    time = time.time() - t
    average = (
        reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0
    )
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print("Time (secs)   # Parses   Average P(parse)")
    print("-----------------------------------------")
    print("%11.4f%11d%19.14f" % (time, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print("------------------------------------------")
    print("%11s%11d%19.14f" % ("n/a", len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print("Draw parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        from nltk.draw.tree import draw_trees

        print("  please wait...")
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print("Print parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        for parse in parses:
            print(parse)

In [24]:
demo()


  1: I saw the man with my telescope
     <Grammar with 17 productions>

  2: the boy saw Jack with Bob under the table with a telescope
     <Grammar with 23 productions>

Which demo (1-2)?  Bad sentence number


In [25]:
# Downloading and importing the brown corpus
nltk.download('brown')
from nltk.corpus import brown

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\thappana\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


In [26]:
# Getting the tagged sentences
sent_tag = brown.tagged_sents()
mod_sent_tag = []
for s in sent_tag:
  s.insert(0,('##','##'))
  s.append(('&&','&&'))
  mod_sent_tag.append(s)

In [27]:
# Splitting the data for train and test
split_num = int(len(mod_sent_tag)*0.9)
train_data = mod_sent_tag[0:split_num]
test_data = mod_sent_tag[split_num:]

In [28]:
# Creating a dictionary whose keys are tags and values contain words which were assigned the correspoding tag
# ex:- 'TAG':{word1: count(word1,'TAG')}
train_word_tag = {}
for s in train_data:
  for (w,t) in s:
    w=w.lower()
    try:
      try:
        train_word_tag[t][w]+=1
      except:
        train_word_tag[t][w]=1
    except:
      train_word_tag[t]={w:1}

In [29]:
# Calculating the emission probabilities using train_word_tag
train_emission_prob={}
for k in train_word_tag.keys():
  train_emission_prob[k]={}
  count = sum(train_word_tag[k].values())
  for k2 in train_word_tag[k].keys():
    train_emission_prob[k][k2]=train_word_tag[k][k2]/count

In [30]:
# Estimating the bigram of tags to be used for transition probability
bigram_tag_data = {}
for s in train_data:
  bi=list(nltk.bigrams(s))
  for b1,b2 in bi:
    try:
      try:
        bigram_tag_data[b1[1]][b2[1]]+=1
      except:
        bigram_tag_data[b1[1]][b2[1]]=1
    except:
      bigram_tag_data[b1[1]]={b2[1]:1}

In [31]:
# Calculating the probabilities of tag bigrams for transition probability  
bigram_tag_prob={}
for k in bigram_tag_data.keys():
  bigram_tag_prob[k]={}
  count=sum(bigram_tag_data[k].values())
  for k2 in bigram_tag_data[k].keys():
    bigram_tag_prob[k][k2]=bigram_tag_data[k][k2]/count

In [32]:
# Calculating the possible tags for each word
# Note: Here we have used the whole data(Train+Test)
# Reason: There may be some words which are not present in train data but are present in test data 
tags_of_tokens = {}
count=0
for s in train_data:
  for (w,t) in s:
    w=w.lower()
    try:
      if t not in tags_of_tokens[w]:
        tags_of_tokens[w].append(t)
    except:
      l = []
      l.append(t)
      tags_of_tokens[w] = l
        
for s in test_data:
  for (w,t) in s:
    w=w.lower()
    try:
      if t not in tags_of_tokens[w]:
        tags_of_tokens[w].append(t)
    except:
      l = []
      l.append(t)
      tags_of_tokens[w] = l

In [33]:
# Dividing the test data into test words and test tags
test_words=[]
test_tags=[]
for s in test_data:
  temp_word=[]
  temp_tag=[]
  for (w,t) in s:
    temp_word.append(w.lower())
    temp_tag.append(t)
  test_words.append(temp_word)
  test_tags.append(temp_tag)

In [34]:
#Executing the Viterbi Algorithm
predicted_tags = []                #intializing the predicted tags
for x in range(len(test_words)):   # for each tokenized sentence in the test data
  s = test_words[x]
  #storing_values is a dictionary which stores the required values
  #ex: storing_values = {step_no.:{state1:[previous_best_state,value_of_the_state]}}                
  storing_values = {}              
  for q in range(len(s)):
    step = s[q]
    #for the starting word of the sentence
    if q == 1:                
      storing_values[q] = {}
      tags = tags_of_tokens[step]
      for t in tags:
        #this is applied since we do not know whether the word in the test data is present in train data or not
        try:
          storing_values[q][t] = ['##',bigram_tag_prob['##'][t]*train_emission_prob[t][step]]
        #if word is not present in the train data but present in test data we assign a very low probability of 0.0001
        except:
          storing_values[q][t] = ['##',0.0001]#*train_emission_prob[t][step]]
    
    #if the word is not at the start of the sentence
    if q>1:
      storing_values[q] = {}
      previous_states = list(storing_values[q-1].keys())   # loading the previous states
      current_states  = tags_of_tokens[step]               # loading the current states
      #calculation of the best previous state for each current state and then storing
      #it in storing_values
      for t in current_states:                             
        temp = []
        for pt in previous_states:                         
          try:
            temp.append(storing_values[q-1][pt][1]*bigram_tag_prob[pt][t]*train_emission_prob[t][step])
          except:
            temp.append(storing_values[q-1][pt][1]*0.0001)
        max_temp_index = temp.index(max(temp))
        best_pt = previous_states[max_temp_index]
        storing_values[q][t]=[best_pt,max(temp)]

  #Backtracing to extract the best possible tags for the sentence
  pred_tags = []
  total_steps_num = storing_values.keys()
  last_step_num = max(total_steps_num)
  for bs in range(len(total_steps_num)):
    step_num = last_step_num - bs
    if step_num == last_step_num:
      pred_tags.append('&&')
      pred_tags.append(storing_values[step_num]['&&'][0])
    if step_num<last_step_num and step_num>0:
      pred_tags.append(storing_values[step_num][pred_tags[len(pred_tags)-1]][0])
  predicted_tags.append(list(reversed(pred_tags)))

In [35]:
#Calculating the accuracy based on tagging each word in the test data.
right = 0 
wrong = 0
for i in range(len(test_tags)):
  gt = test_tags[i]
  pred = predicted_tags[i]
  for h in range(len(gt)):
    if gt[h] == pred[h]:
      right = right+1
    else:
      wrong = wrong +1 

print('Accuracy on the test data is: ',right/(right+wrong))
print('Loss on the test data is: ',wrong/(right+wrong))

Accuracy on the test data is:  0.9214105322233654
Loss on the test data is:  0.07858946777663456


In [69]:
import nltk
from nltk.corpus import brown
import numpy as np
from collections import Counter
from collections import defaultdict
from math import log
import time

stime = time.time()
sentences = np.array(brown.tagged_sents())
words = brown.tagged_words()
tokens,taged = zip(*words)

#
# firstdict = {}
# firstSum = len(sentences)
# for i in sentences:
#     x,y = i[0]
#     if y not in firstdict.keys():
#         firstdict[y] = 1
#     else:
#         firstdict[y] += 1
#
# for i in firstdict.keys():
#     firstdict[i] = firstdict[i]/firstSum

# total word count
total = len(words)

# preping corpus data
wordcount = Counter(tokens)
tokenTags = defaultdict(Counter)
for token, tag in words:
    tokenTags[token][tag] += 1

tagcount = Counter(taged)
for i in tagcount.keys():
    tagcount[i] = tagcount[i]/total

bgram = nltk.ngrams(taged,2)
tagtags = defaultdict(Counter)
for tag1, tag2 in bgram:
    tagtags[tag1][tag2] += 1


#viterbi implementation
trans = {}
StateProbs = {}
def viterbi(prior,transition,num):
    a = []
    trans[num] = []
    StateProbs[num+1] = []
    emmision = tokenTags[test[num]]
    wn =wordcount[test[num]]
    p = {}
    for ik,ii in emmision.items():
        #hold probs
        min = 100000
        for jk,ji in prior.items():
            if transition[jk][ik] != 0:
                if num==0:
                    prob = log((ii/wn),2)  + (log((transition[jk][ik]/(total-1)), 2))
                else:
                    prob = ji + log((ii/wn), 2) + (log((transition[jk][ik]/(total-1)), 2))
                trans[num].append([(jk,ik),prob])
                if min > prob:
                    min = prob
        p[ik] = min
        StateProbs[num+1].append([ik,min])
    return p

#sentence and test
#test = ['Time','flies','like','an','arrow','.']
test = text_data.sample()["text"].values.tolist()[0].split(' ')
print("test: ", test)
prev = viterbi(tagcount,tagtags,0)
for i in range(1,len(test)):
    prev = viterbi(prev,tagtags,i)

del trans[0]

prevP = 0
prev = ''
order = []

#backpropogation
for i in range(len(test)-1,-1,-1):
    if i == len(test)-1:
        min = 100000000
        for j in StateProbs[i+1]:
            if min > j[1]:
                prev = j[0]
                min = j[1]
                prevP = j[1]
        order.append(prev)
    else:
        for g in trans[i+1]:
            if prevP == g[1]:
                x,y = g[0]
                prev = x
                order.append(prev)
        for k in StateProbs[i+1]:
            if k[0] == prev:
                prevP = k[1]

#solution
sol = []
for i in reversed(order):
    sol.append(i)
print(sol)
print("--- %s seconds ---" % (time.time() - stime))
for t in test:
    print(t,end=',')
    print()
list_zip = zip(test, reversed(order)) 
print(list(list_zip))

  sentences = np.array(brown.tagged_sents())


test:  ['taboooo', 'fun']
['NN']
--- 20.37156891822815 seconds ---
taboooo,
fun,
[('taboooo', 'NN')]


In [43]:
test = text_data_sample.head(1)["text"].str.split(" ")
print("test: ", test)

test:  270196    [hate, sleeping, alone, california, king, size...
Name: text, dtype: object


In [68]:
row = text_data.sample()["text"].values.tolist()[0].split(' ')
row

['sprint', 'store', 'see', 'figure', 'camera', 'work']