## Imports

In [2]:
import pandas as pd
import numpy as np
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import re

## 1a. Download and  load Data

In [11]:
#File Locations
amazonFileName = "sentiment labelled sentences/amazon_cells_labelled.txt"
yelpFileName = "sentiment labelled sentences/yelp_labelled.txt"
imdbFileName = "sentiment labelled sentences/imdb_labelled.txt"

In [159]:
#Special case - imdb, both tabs and newlines need to be taken care of
def cleanFile(fname):
    f = open(fname,"r")
    cleanOutput=[]
    for line in f:
        line = line.replace("\n","")
        cleanOutput.append(line.split("\t"))
    return cleanOutput

In [160]:
#Import Data into Pandas
amazon = pd.read_csv(amazonFileName, sep="\t",header=None).dropna().reset_index().drop("index",axis =1)
yelp = pd.read_csv(yelpFileName, sep="\t", header=None,encoding="utf-8").dropna().reset_index().drop("index", axis=1)

cleanImdb = cleanFile(imdbFileName)
imdb = pd.DataFrame(cleanImdb)

In [161]:
print amazon

                                                     0  1
0    So there is no way for me to plug it in here i...  0
1                          Good case, Excellent value.  1
2                               Great for the jawbone.  1
3    Tied to charger for conversations lasting more...  0
4                                    The mic is great.  1
5    I have to jiggle the plug to get it to line up...  0
6    If you have several dozen or several hundred c...  0
7          If you are Razr owner...you must have this!  1
8                  Needless to say, I wasted my money.  0
9                     What a waste of money and time!.  0
10                     And the sound quality is great.  1
11   He was very impressed when going from the orig...  1
12   If the two were seperated by a mere 5+ ft I st...  0
13                            Very good quality though  1
14   The design is very odd, as the ear "clip" is n...  0
15   Highly recommend for any one who has a blue to...  1
16            

## 1.b Preprocessing Strategy

In [162]:
def lowerCase(x):
    """
    x is a string. Every letter in x is turned to lowercase
    """
    return x.lower()

def lemmatize(lmt,x):
    """
    It takes a word x and the lemmatizer lmt from nltk. Lmt lemmatizes x
    """
    return lmt.lemmatize(x)

def stripPunctuation(s):
    """
    It takes a string x and uses regular expression library to remove punctuation from it.
    Reference: https://www.quora.com/How-do-I-remove-punctuation-from-a-Python-string
    """
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    return regex.sub(' ', s)


def getStopWords():
    # Do all stop words need to be excluded. Should we keep negatives like no or not? - Francesco 10/29.
    return set(stopwords.words("english")) 

def preProcessing(x):
    processedString = []
    lemmatizer = WordNetLemmatizer()
    stopWords = getStopWords()
    xLowerCase = lowerCase(x)
    xWithoutPunct = stripPunctuation(xLowerCase)
    
    for word in xWithoutPunct.split():
        try:
            if unicode(word) not in stopWords:
                lemmatizedWord = lemmatize(lemmatizer, unicode(word))
                processedString.append(str(lemmatizedWord))
        except (UnicodeDecodeError, UnicodeEncodeError):
            print "UnicodeDecodeError: " + word
       
    return processedString    

In [163]:
"""
Apply the preprocessing method on the sentences in the amazon,yelp and imdb dataframes.
"""
amazon[0] = amazon[0].map(preProcessing)
yelp[0] = yelp[0].map(preProcessing)
imdb[0] = imdb[0].map(preProcessing)

UnicodeDecodeError: fiancé
UnicodeDecodeError: café
UnicodeDecodeError: crêpe
UnicodeDecodeError: puréed
UnicodeDecodeError: 
UnicodeDecodeError: québec
UnicodeDecodeError: iswas
UnicodeDecodeError: 
UnicodeDecodeError: clichés
UnicodeDecodeError: clichés
UnicodeDecodeError: aurvåg
UnicodeDecodeError: 
UnicodeDecodeError: problemsthe
UnicodeDecodeError: 
UnicodeDecodeError: clichés
UnicodeDecodeError: 
UnicodeDecodeError: seeing


In [196]:
"""
The label values in the imdb dataframe need to casted to ints
"""
imdb[1] = imdb[1].map(int)

## 1.c Split training and testing set

### Amazon

In [186]:
amazonPositiveTrain = amazon[amazon[1] == 1].head(400)
amazonPositiveTest = amazon[amazon[1] == 1].head(100)
amazonNegativeTrain = amazon[amazon[1] == 0].head(400)
amazonNegativeTest = amazon[amazon[1] == 0].head(100)

### Yelp

In [187]:
yelpPositiveTrain = yelp[yelp[1] == 1].head(400)
yelpPositiveTest = yelp[yelp[1] == 1].head(100)
yelpNegativeTrain = yelp[yelp[1] == 0].head(400)
yelpNegativeTest = yelp[yelp[1] == 0].head(100)

### Imdb

In [197]:
imdbPositiveTrain = imdb[imdb[1] == 1].head(400)
imdbPositiveTest = imdb[imdb[1] == 1].head(100)
imdbNegativeTrain = imdb[imdb[1] == 0].head(400)
imdbNegativeTest = imdb[imdb[1] == 0].head(100)

In [198]:
trainFrames = [amazonPositiveTrain,amazonNegativeTrain,
               yelpPositiveTrain,yelpNegativeTrain,
               imdbPositiveTrain,imdbNegativeTrain]

testFrames = [amazonPositiveTest,amazonNegativeTest,
              yelpPositiveTest,yelpNegativeTest,
              imdbPositiveTest,imdbNegativeTest]

trainDF = pd.concat(trainFrames).reset_index().drop("index",axis=1)
testDF =  pd.concat(testFrames).reset_index().drop("index",axis=1)

In [199]:
print trainDF.shape
print testDF.shape

(2400, 2)
(600, 2)


## 1.d Bag of Words model

In [260]:
def generateBag(reviews):
    """
    words is a list of words. The function creates a dictionary of unique words
    """
    wordBag ={}
    for review in reviews:
        for word in review:
            if word not in wordBag.keys():
                wordBag[word] = 0
    return wordBag

def indexBag(bag):
    """
    bag is the input. The function essentially assigns an index for each word in the bag.The index will be needed
    for counting the frequency of its word and create the correct feature vector.
    """
    idx = 0
    for word in bag.keys():
        bag[word] = idx
        idx+=1
    return bag

def wordFrequency(reviews,bag):
    """
    iterates through a review and using the bag, it returns the feature vector of the review.
    """
    featureVectorList = []
    for review in reviews:
        featureVec = [0 for uniqueWord in range(0,len(bag.keys()))]
        for word in review:
            if word in bag.keys():
                #update the value in featureVec whose index is found in bag[word]
                featureVec[bag[word]]+=1
            else:
                print "Error: " + word + " is not in the bag."
        featureVectorList.append(featureVec)
    return pd.DataFrame(featureVectorList,columns = bag.keys())

In [255]:
#generate bag only from Training Data
bagOfWords = generateBag(trainDF[0].tolist())
indexedBag = indexBag(bagOfWords)

In [261]:
#generate all the feature vectors for Training and Test Data
trainingVecs = wordFrequency(trainDF[0].tolist(),indexedBag)
testVecs = wordFrequency(testDF[0].tolist(),indexedBag)

In [263]:
#Report feature vectors any two reviews
print trainingVecs.head(2)

   limited  versatile  secondly  magnetic  personally  bear  yellow  sleek  \
0        0          0         0         0           0     0       0      0   
1        0          0         0         0           0     0       0      0   

   four  sleep   ...     jewel  forgetting  portion  pandering  compete  \
0     0      0   ...         0           0        0          0        0   
1     0      0   ...         0           0        0          0        0   

   monstrous  searched  yell  abhor  jawbone  
0          0         0     0      0        0  
1          0         0     0      0        1  

[2 rows x 4075 columns]


## 1.e Postprocessing Strategy