In [3]:
# -------------------Part 1 of Yelp Project: Create a Class to Pre-Process reviews and ratings------------------

# First, import essential packages
import requests
import bs4
from bs4 import BeautifulSoup
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

import pandas as pd
sns.set()

In [73]:
# Instantiate the stopwords and lemmatizer and modifiers

# We will extract stop words from a website, and then lemmatize them
lemma = WordNetLemmatizer()
allowed_modifiers = ['J', 'R', 'C']
nouns = ['N']
stops = set(stopwords.words('english'))

In [74]:
# This will covert Treebank Tags to Wordnet POS Tags

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    elif treebank_tag.startswith('P'):
        return wordnet.NOUN
    else:
        return ''

In [75]:
# first we import the Rotten Tomatoes dataset imported from Kaggle 
data_dir = '/Users/rabeya/computational-text-analysis-master-2020/day-2/'
train_data = pd.read_csv(data_dir+'train.tsv', sep='\t')
test_data = pd.read_csv(data_dir+'test.tsv', sep='\t')

In [76]:
train_data.shape,test_data.shape

((156060, 4), (66292, 3))

In [77]:
# The labels are:
{0: 'negative',
1: 'somewhat negative',
2: 'neutral',
3: 'somewhat positive',
4: 'positive'}

{0: 'negative',
 1: 'somewhat negative',
 2: 'neutral',
 3: 'somewhat positive',
 4: 'positive'}

In [78]:
train_data.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


First, we should calculate some basic statistics of the data-set
before implementing any machine-learning algorithm/model. 
Our example, let's just calculate the frequency of labels that are in this data.
That alone could give us important information. 

In [79]:
train_data['Sentiment'].value_counts(normalize=True)

2    0.509945
3    0.210989
1    0.174760
4    0.058990
0    0.045316
Name: Sentiment, dtype: float64

Already we see that the classes are not balanced. 50% of the reviews are just for the neutral label, and the labels that are negative or positive are only about 4-5%. Usually, you could go on
by trying to balance the data-set by either under-sampling the larger class-label data
or artificially over-sampling it. But instead, we're just going to leave the 
unbalanced data-set as it is, since we'll be using a general inference technique that 
doesn't care for balanced or unbalanced data.

First, what we need to do is convert each data-record in the "Phrase" column into word tokens, and remove all stopwords, and lower-case all the words. 

In [80]:
import string
symbols = string.punctuation 
stops = stopwords.words('english')

def clean_sentence(s):
    words = word_tokenize(s)
    words = [w.lower() for w in words]
    words = [w for w in words if w not in symbols]
    # lets remove stopword 
    words = [w for w in words if w not in stops]
    # now let's lemmatize the words!
    words = [lemma.lemmatize(pair[0], get_wordnet_pos(pair[1])) 
         for pair in pos_tag(words) 
         if (get_wordnet_pos(pair[1]) not in '')]
    # finally lets unique-fy the set
    words = list(set(words))
    return words
    

In [81]:
train_data['Phrase'][0]

'A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .'

In [87]:
# lets use the clean_sentence() function to clean all sentences in the 'Phrase' column
for i in range(len(train_data['Phrase'])):
    sent = train_data['Phrase'][i]
    train_data['Phrase'][i] = clean_sentence(sent)
    i = i+1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


KeyboardInterrupt: 

In [None]:
train_data['Phrase'][0]

In [None]:
# -------------- This section is CountVectorizer: DTM Matrix (word-frequency) Analysis --------------
from sklearn.feature_extraction.text import CountVectorizer

countvec = CountVectorizer(max_features=5000, binary=True)
sparse_dtm = countvec.fit_transform(all_reviews['reviews'])
features_dtm = sparse_dtm.toarray()
DTM_table = pd.DataFrame(features_dtm, columns=countvec.get_feature_names())


In [None]:
# this si our variable data
response = all_reviews['opinions'].values
X_dtm, y_dtm = features_dtm, response
review_df = pd.DataFrame(X_dtm, ...)



In [None]:
# ---------------- Section 2: TF-IDF Matrix Vectorizer to analyze Word importance -----------------

# First, create the TF-IDF Vectorizer Object
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfvec = TfidfVectorizer()
sparse_tfidf = tfidfvec.fit_transform(all_reviews['reviews'])

#Next, create the TF-IDF feature matrix
#tfidf = pd.DataFrame(sparse_tfidf.toarray(), columns=tfidfvec.get_feature_names(), index=all_reviews.index)
features_tfidf = sparse_tfidf.toarray()

In [159]:
# Question: can we engineer a NEW dataset with new features,
# but derived from the old data?

# This new dataset would have 4 features:
# - 1) #(pos adj) / #(neg adj) 
# - 2) #(pos adv) / #(neg adv) 
# - 3) #(pos verb) / #(neg verb)
# - 4) # miscellanous words (figure this out in detail)

# Feature 1: first we need a list of almost every (+) english adjective, lemmatized

# First, we still need to process all the reviews and remove stopwords, unnecsary words 
# (which ontribute no meaning), lemmatize each word according to its pos_tag, and actually
# pos_tag the words

# Then when we process the reviews, we need to use Word2Vec to compare them to
# bag of positive and negative words in our vocabulary

