# Import library

In [1]:
import pandas as pd
import re
import nltk
nltk.download('popular')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('sentiwordnet')
from nltk.corpus import sentiwordnet as swn

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\quain\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\quain\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\quain\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\quain\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\quain\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]   

# Load data

In [2]:
df_bottom = pd.read_csv("Crawling data/bottom_movie_reviews.csv")
df_top = pd.read_csv("Crawling data/top_movie_reviews.csv")

In [3]:
df_full = pd.concat([df_top, df_bottom], ignore_index=True)
df_full

Unnamed: 0,user_name,title,rating,date,review
0,andrewburgereviews,"An offer so good, I couldn't refuse",10,1 April 2019,It is now past 1 PM and I just finished watchi...
1,gogoschka-1,"For Me, This Is The Definitive Film",10,11 February 2018,This isn't just a beautifully crafted gangster...
2,Sleepin_Dragon,One of the best of all time.,10,6 February 2021,"A masterclass in film making, is The Godfather..."
3,alexkolokotronis,An Iconic Film,10,21 June 2008,Tell me a movie that is more famous than this....
4,b-a-h TNT-6,"Another kind of ""family movie""",10,5 March 2002,The Godfather is one of the few films in which...
...,...,...,...,...,...
13376,McStubby,Overrated in its badness,6,5 June 2021,Definitely not the worst movie ever. People wh...
13377,scottfilm,Absolute Crap!!! Needs a Remake,1,6 September 2006,"The idea of this film was interesting, but the..."
13378,Toxikfoxx,Troll 2...,1,7 August 2001,I will admit that Troll 2 was so insanely grea...
13379,markus_l-37396,Troll muthafucin 2 is tha greatest movie evner...,10,27 June 2022,Troll 2 is the first movie in the GOBLIN franc...


In [4]:
raw_data = df_full[["review"]]
raw_data

Unnamed: 0,review
0,It is now past 1 PM and I just finished watchi...
1,This isn't just a beautifully crafted gangster...
2,"A masterclass in film making, is The Godfather..."
3,Tell me a movie that is more famous than this....
4,The Godfather is one of the few films in which...
...,...
13376,Definitely not the worst movie ever. People wh...
13377,"The idea of this film was interesting, but the..."
13378,I will admit that Troll 2 was so insanely grea...
13379,Troll 2 is the first movie in the GOBLIN franc...


# Pre-process

## Tokenzize sentences

In [5]:
sentences_data = []
for review in raw_data['review']:
    tokens = nltk.sent_tokenize(review)
    sentences_data.extend(tokens)
data = pd.DataFrame({"review": sentences_data})

In [6]:
data

Unnamed: 0,review
0,It is now past 1 PM and I just finished watchi...
1,I should probably go to bed.
2,It's late and tomorrow I have to wake up a bit...
3,But not early enough to postpone writing these...
4,"Now that I have seen it three times, the oppor..."
...,...
172697,I watched this film because I wanted so see a ...
172698,"But everyone, who wants a really good film wit..."
172699,The trolls in this film look really funny and ...
172700,"But it wasn't a horror movie for me, I never w..."


## Remove special symbols

In [7]:
data['cleaned_review'] = data['review'].apply(lambda t: re.sub('[^A-Za-z]+', ' ', t))
data.head()

Unnamed: 0,review,cleaned_review
0,It is now past 1 PM and I just finished watchi...,It is now past PM and I just finished watching...
1,I should probably go to bed.,I should probably go to bed
2,It's late and tomorrow I have to wake up a bit...,It s late and tomorrow I have to wake up a bit...
3,But not early enough to postpone writing these...,But not early enough to postpone writing these...
4,"Now that I have seen it three times, the oppor...",Now that I have seen it three times the opport...


## POS tagging

In [8]:
pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}
def POS_tagging(text):
  tags = pos_tag(word_tokenize(text))
  list_tag = []
  for word, tag in tags:
      if word.lower() not in set(stopwords.words('english')):
        list_tag.append(tuple([word, pos_dict.get(tag[0])]))
  return list_tag

In [9]:
data['pos_tagged'] = data['cleaned_review'].apply(POS_tagging)
data.head()

Unnamed: 0,review,cleaned_review,pos_tagged
0,It is now past 1 PM and I just finished watchi...,It is now past PM and I just finished watching...,"[(past, a), (PM, n), (finished, v), (watching,..."
1,I should probably go to bed.,I should probably go to bed,"[(probably, r), (go, v), (bed, v)]"
2,It's late and tomorrow I have to wake up a bit...,It s late and tomorrow I have to wake up a bit...,"[(late, a), (tomorrow, n), (wake, v), (bit, r)..."
3,But not early enough to postpone writing these...,But not early enough to postpone writing these...,"[(early, r), (enough, r), (postpone, v), (writ..."
4,"Now that I have seen it three times, the oppor...",Now that I have seen it three times the opport...,"[(seen, v), (three, None), (times, v), (opport..."


## Tokenize lemmatized sentences

In [10]:
def lemmatize(pos_data):
    lemma_sentence = " "
    for item in pos_data:
        lemma = WordNetLemmatizer().lemmatize(item[0])
        lemma_sentence = lemma_sentence + " " + lemma
    return lemma_sentence

data['lemma'] = data['pos_tagged'].apply(lemmatize)
data['lemma_words'] = data['lemma'].apply(word_tokenize)
data.head()

Unnamed: 0,review,cleaned_review,pos_tagged,lemma,lemma_words
0,It is now past 1 PM and I just finished watchi...,It is now past PM and I just finished watching...,"[(past, a), (PM, n), (finished, v), (watching,...",past PM finished watching Francis Ford Coppo...,"[past, PM, finished, watching, Francis, Ford, ..."
1,I should probably go to bed.,I should probably go to bed,"[(probably, r), (go, v), (bed, v)]",probably go bed,"[probably, go, bed]"
2,It's late and tomorrow I have to wake up a bit...,It s late and tomorrow I have to wake up a bit...,"[(late, a), (tomorrow, n), (wake, v), (bit, r)...",late tomorrow wake bit early,"[late, tomorrow, wake, bit, early]"
3,But not early enough to postpone writing these...,But not early enough to postpone writing these...,"[(early, r), (enough, r), (postpone, v), (writ...",early enough postpone writing line,"[early, enough, postpone, writing, line]"
4,"Now that I have seen it three times, the oppor...",Now that I have seen it three times the opport...,"[(seen, v), (three, None), (times, v), (opport...",seen three time opportunity sharing thought ...,"[seen, three, time, opportunity, sharing, thou..."


## Sentiment rule based labeling

In [11]:
def wordnet_sentiment_analysis(pos_data):
    sentiment = 0
    tokens_count = 0
    for word, pos in pos_data:
        if not pos:
            continue
        lemma = WordNetLemmatizer().lemmatize(word, pos=pos)
        if not lemma:
            continue
        synsets = wordnet.synsets(lemma, pos=pos)
        if not synsets:
            continue
        synset = synsets[0]
        swn_synset = swn.senti_synset(synset.name())
        sentiment += swn_synset.pos_score() - swn_synset.neg_score()
        tokens_count += 1
    if not tokens_count:
        return None
    if sentiment>0:
        return "positive"
    if sentiment==0:
        return "neutral"
    else:
        return "negative"

In [12]:
data['polarity'] = data['pos_tagged'].apply(wordnet_sentiment_analysis)
data.head()

Unnamed: 0,review,cleaned_review,pos_tagged,lemma,lemma_words,polarity
0,It is now past 1 PM and I just finished watchi...,It is now past PM and I just finished watching...,"[(past, a), (PM, n), (finished, v), (watching,...",past PM finished watching Francis Ford Coppo...,"[past, PM, finished, watching, Francis, Ford, ...",negative
1,I should probably go to bed.,I should probably go to bed,"[(probably, r), (go, v), (bed, v)]",probably go bed,"[probably, go, bed]",neutral
2,It's late and tomorrow I have to wake up a bit...,It s late and tomorrow I have to wake up a bit...,"[(late, a), (tomorrow, n), (wake, v), (bit, r)...",late tomorrow wake bit early,"[late, tomorrow, wake, bit, early]",neutral
3,But not early enough to postpone writing these...,But not early enough to postpone writing these...,"[(early, r), (enough, r), (postpone, v), (writ...",early enough postpone writing line,"[early, enough, postpone, writing, line]",positive
4,"Now that I have seen it three times, the oppor...",Now that I have seen it three times the opport...,"[(seen, v), (three, None), (times, v), (opport...",seen three time opportunity sharing thought ...,"[seen, three, time, opportunity, sharing, thou...",positive


In [13]:
data.count()

review            172702
cleaned_review    172702
pos_tagged        172702
lemma             172702
lemma_words       172702
polarity          163684
dtype: int64

In [14]:
data['polarity'].value_counts()

positive    81576
negative    46512
neutral     35596
Name: polarity, dtype: int64

In [15]:
data.to_csv('labeled_data.csv',index=False)