## Imports

In [65]:
import pandas as pd
import numpy as np
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import re
from sklearn import preprocessing
import random
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
import scipy

## 1a. Download and  load Data

In [2]:
#File Locations
amazonFileName = "sentiment labelled sentences/amazon_cells_labelled.txt"
yelpFileName = "sentiment labelled sentences/yelp_labelled.txt"
imdbFileName = "sentiment labelled sentences/imdb_labelled.txt"

In [3]:
#Special case - imdb, both tabs and newlines need to be taken care of
def cleanFile(fname):
    f = open(fname,"r")
    cleanOutput=[]
    for line in f:
        line = line.replace("\n","")
        cleanOutput.append(line.split("\t"))
    return cleanOutput

In [4]:
#Import Data into Pandas
amazon = pd.read_csv(amazonFileName, sep="\t",header=None).dropna().reset_index().drop("index",axis =1)
yelp = pd.read_csv(yelpFileName, sep="\t", header=None,encoding="utf-8").dropna().reset_index().drop("index", axis=1)

cleanImdb = cleanFile(imdbFileName)
imdb = pd.DataFrame(cleanImdb)

In [5]:
print amazon

                                                     0  1
0    So there is no way for me to plug it in here i...  0
1                          Good case, Excellent value.  1
2                               Great for the jawbone.  1
3    Tied to charger for conversations lasting more...  0
4                                    The mic is great.  1
5    I have to jiggle the plug to get it to line up...  0
6    If you have several dozen or several hundred c...  0
7          If you are Razr owner...you must have this!  1
8                  Needless to say, I wasted my money.  0
9                     What a waste of money and time!.  0
10                     And the sound quality is great.  1
11   He was very impressed when going from the orig...  1
12   If the two were seperated by a mere 5+ ft I st...  0
13                            Very good quality though  1
14   The design is very odd, as the ear "clip" is n...  0
15   Highly recommend for any one who has a blue to...  1
16            

## 1.b Preprocessing Strategy

In [6]:
def lowerCase(x):
    """
    x is a string. Every letter in x is turned to lowercase
    """
    return x.lower()

def lemmatize(lmt,x):
    """
    It takes a word x and the lemmatizer lmt from nltk. Lmt lemmatizes x
    """
    return lmt.lemmatize(x)

def stripPunctuation(s):
    """
    It takes a string x and uses regular expression library to remove punctuation from it.
    Reference: https://www.quora.com/How-do-I-remove-punctuation-from-a-Python-string
    """
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    return regex.sub(' ', s)


def getStopWords():
    # Do all stop words need to be excluded. Should we keep negatives like no or not? - Francesco 10/29.
    return set(stopwords.words("english")) 

def preProcessing(x):
    processedString = []
    lemmatizer = WordNetLemmatizer()
    stopWords = getStopWords()
    xLowerCase = lowerCase(x)
    xWithoutPunct = stripPunctuation(xLowerCase)
    
    for word in xWithoutPunct.split():
        try:
            if unicode(word) not in stopWords:
                lemmatizedWord = lemmatize(lemmatizer, unicode(word))
                processedString.append(str(lemmatizedWord))
        except (UnicodeDecodeError, UnicodeEncodeError):
            print "UnicodeDecodeError: " + word
       
    return processedString    

In [7]:
"""
Apply the preprocessing method on the sentences in the amazon,yelp and imdb dataframes.
"""
amazon[0] = amazon[0].map(preProcessing)
yelp[0] = yelp[0].map(preProcessing)
imdb[0] = imdb[0].map(preProcessing)

UnicodeDecodeError: fiancé
UnicodeDecodeError: café
UnicodeDecodeError: crêpe
UnicodeDecodeError: puréed
UnicodeDecodeError: 
UnicodeDecodeError: québec
UnicodeDecodeError: iswas
UnicodeDecodeError: 
UnicodeDecodeError: clichés
UnicodeDecodeError: clichés
UnicodeDecodeError: aurvåg
UnicodeDecodeError: 
UnicodeDecodeError: problemsthe
UnicodeDecodeError: 
UnicodeDecodeError: clichés
UnicodeDecodeError: 
UnicodeDecodeError: seeing


In [8]:
"""
The label values in the imdb dataframe need to casted to ints
"""
imdb[1] = imdb[1].map(int)

## 1.c Split training and testing set

### Amazon

In [9]:
amazonPositiveTrain = amazon[amazon[1] == 1].head(400)
amazonPositiveTest = amazon[amazon[1] == 1].head(100)
amazonNegativeTrain = amazon[amazon[1] == 0].head(400)
amazonNegativeTest = amazon[amazon[1] == 0].head(100)

### Yelp

In [10]:
yelpPositiveTrain = yelp[yelp[1] == 1].head(400)
yelpPositiveTest = yelp[yelp[1] == 1].head(100)
yelpNegativeTrain = yelp[yelp[1] == 0].head(400)
yelpNegativeTest = yelp[yelp[1] == 0].head(100)

### Imdb

In [11]:
imdbPositiveTrain = imdb[imdb[1] == 1].head(400)
imdbPositiveTest = imdb[imdb[1] == 1].head(100)
imdbNegativeTrain = imdb[imdb[1] == 0].head(400)
imdbNegativeTest = imdb[imdb[1] == 0].head(100)

In [12]:
trainFrames = [amazonPositiveTrain,amazonNegativeTrain,
               yelpPositiveTrain,yelpNegativeTrain,
               imdbPositiveTrain,imdbNegativeTrain]

testFrames = [amazonPositiveTest,amazonNegativeTest,
              yelpPositiveTest,yelpNegativeTest,
              imdbPositiveTest,imdbNegativeTest]

trainDF = pd.concat(trainFrames).reset_index().drop("index",axis=1)
testDF =  pd.concat(testFrames).reset_index().drop("index",axis=1)

In [13]:
print trainDF.shape
print testDF.shape

(2400, 2)
(600, 2)


## 1.d Bag of Words model

In [14]:
def generateBag(reviews):
    """
    words is a list of words. The function creates a dictionary of unique words
    """
    wordBag ={}
    for review in reviews:
        for word in review:
            if word not in wordBag.keys():
                wordBag[word] = 0
    return wordBag

def indexBag(bag):
    """
    bag is the input. The function essentially assigns an index for each word in the bag.The index will be needed
    for counting the frequency of its word and create the correct feature vector.
    """
    idx = 0
    for word in bag.keys():
        bag[word] = idx
        idx+=1
    return bag

def wordFrequency(reviews,bag):
    """
    iterates through a review and using the bag, it returns the feature vector of the review.
    """
    featureVectorList = []
    for review in reviews:
        featureVec = [0.0 for uniqueWord in range(0,len(bag.keys()))]
        for word in review:
            if word in bag.keys():
                #update the value in featureVec whose index is found in bag[word]
                featureVec[bag[word]]+=1.0
            else:
                print "Error: " + word + " is not in the bag."
        featureVectorList.append(featureVec)
    return pd.DataFrame(featureVectorList,columns = bag.keys())

In [15]:
#generate bag only from Training Data
bagOfWords = generateBag(trainDF[0].tolist())
indexedBag = indexBag(bagOfWords)

In [16]:
#generate all the feature vectors for Training and Test Data
trainingVecs = wordFrequency(trainDF[0].tolist(),indexedBag)
testVecs = wordFrequency(testDF[0].tolist(),indexedBag)

In [17]:
#Report feature vectors any two reviews
print trainingVecs.head(2)

   limited  versatile  secondly  magnetic  personally  bear  yellow  sleek  \
0        0          0         0         0           0     0       0      0   
1        0          0         0         0           0     0       0      0   

   four  sleep   ...     jewel  forgetting  portion  pandering  compete  \
0     0      0   ...         0           0        0          0        0   
1     0      0   ...         0           0        0          0        0   

   monstrous  searched  yell  abhor  jawbone  
0          0         0     0      0        0  
1          0         0     0      0        1  

[2 rows x 4075 columns]


## 1.e Postprocessing Strategy

In [18]:
### L1 or lasso was chosen as the normalization method because it best suits sparse data.
def normalize(df):
    normalizedDf = preprocessing.normalize(df, norm='l1')
    return pd.DataFrame(normalizedDf)

In [19]:
normalizedTraining = normalize(trainingVecs)
normalizedTest = normalize(testVecs)

## 1.f Training set clustering (K-means implementation)

In [20]:
def getFirstMeans(trainData,k):
    firstMeans = []
    idxs = [x for x in range(len(trainData))]
    meanIdxs = random.sample(idxs,k)
    for idx in meanIdxs:
        firstMeans.append(trainData.iloc[idx].values)
    return firstMeans

def clustering(trainData,clusterMeans,k):
    rowIdxs = [i for i in range(len(trainData))]
    clusters = []
    for num in range(k):
        clusters.append([])
    
    for row in rowIdxs :
        distances = []
        for mean in clusterMeans:
            dist = np.linalg.norm(trainData.iloc[row] - mean)
            distances.append(dist)
        clusterIdx = np.argmin(distances)
        
        clusters[clusterIdx].append(row)
    return clusters
    
def newMeans(trainData,clusters):
    means = []
    for cluster in clusters:
        clusterMean = np.mean(trainData.iloc[cluster])
        means.append(clusterMean)
    return means

def KMeans(trainData,k):
    iters = 0
    initMeans = getFirstMeans(trainData,k)
    clusters1 = clustering(trainData,initMeans,k)
    means1 =  newMeans(trainData,clusters1)
    clusters2 = clustering(trainData,means1,k)
    means2 = newMeans(trainData,clusters2)
    
    while clusters1 != clusters2:
        print "iteration: " + str(iters+1)
        clusters1 = clustering(trainData,means2,k)
        means1 = newMeans(trainData,clusters1)
        clusters2 = clustering(trainData,means1,k)
        means2 = newMeans(trainData,clusters2)
        iters += 1
    
    return clusters2

    
    
    

In [21]:
k = 2
kMeansCluster = KMeans(normalizedTraining,k)

iteration: 1
iteration: 2
iteration: 3


### Report Results

In [22]:
def displayResults(dfTrain,clusters):
    dfTrain[1] = dfTrain[1].map(int)
    for i,cluster in enumerate(clusters):
        clusterData = dfTrain.iloc[cluster]
        clusterData[1] = clusterData[1].map(int)
        print
        print "Number of points in cluster " + str(i) + " :"  + str(len(cluster))
        print "Number of 1s in cluster " + str(i) + " :" + str(clusterData[clusterData[1] == 1].shape[0])
        print "percent of 1s in  cluster " + str(i) + " :" + str((clusterData[clusterData[1] == 1].shape[0])/float(len(cluster))) 
        print "Number of 0s in cluster " + str(i) + " :" + str(clusterData[clusterData[1] == 0].shape[0])
        print "percent of 0s in cluster " +  str(i) + " :" + str((clusterData[clusterData[1] == 0].shape[0])/float(len(cluster)))

In [23]:
displayResults(trainDF,kMeansCluster)


Number of points in cluster 0 :101
Number of 1s in cluster 0 :60
percent of 1s in  cluster 0 :0.594059405941
Number of 0s in cluster 0 :41
percent of 0s in cluster 0 :0.405940594059

Number of points in cluster 1 :2299
Number of 1s in cluster 1 :1140
percent of 1s in  cluster 1 :0.495867768595
Number of 0s in cluster 1 :1159
percent of 0s in cluster 1 :0.504132231405


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## 1.g Sentiment prediction

In [39]:
def logisticRegression(trainVecs,trainLabels,testVecs,testLabels):
    model = linear_model.LogisticRegression()
    modelFit = model.fit(trainVecs, trainLabels)
    score = modelFit.score(testVecs, testLabels)
    pred = modelFit.predict(testVecs)
    confusionMatrix = confusion_matrix(pred, testLabels)
    return score,confusionMatrix

In [40]:
score,confusionMatrix = logisticRegression(normalizedTraining,trainDF[1],normalizedTest,testDF[1])

In [41]:
print score
print confusionMatrix

0.843333333333
[[270  64]
 [ 30 236]]


## 1.h N-gram model

In [45]:
def generateBiGramDict(reviews):
    """
    words is a list of words. The function creates a dictionary of unique words
    """
    biGramDict ={}
    for review in reviews:
        for i in range(len(review)-1):
            biGram= review[i] + " " + review[i+1]
            if biGram not in  biGramDict.keys():
                biGramDict[biGram] = 0
    return biGramDict

def indexBiGram(d):
    """
    bag is the input. The function essentially assigns an index for each word in the bag.The index will be needed
    for counting the frequency of its word and create the correct feature vector.
    """
    idx = 0
    for biGram in d.keys():
        d[biGram] = idx
        idx+=1
    return d

def biGramFrequency(reviews,d):
    """
    iterates through a review and using the bag, it returns the feature vector of the review.
    """
    featureVectorList = []
    for review in reviews:
        featureVec = [0.0 for uniqueWord in range(0,len(d.keys()))]
        for i in range(len(review)-1):
            biGram= review[i] + " " + review[i+1]
            if biGram in d.keys():
                #update the value in featureVec whose index is found in bag[word]
                featureVec[d[biGram]]+=1.0
            else:
                print "Error: " + biGram + " not in dictionary."
        featureVectorList.append(featureVec)
    return pd.DataFrame(featureVectorList,columns = d.keys())

In [46]:
#generate bag only from Training Data
biGramDict = generateBiGramDict(trainDF[0].tolist())
indexedBiGram = indexBiGram(biGramDict)

In [49]:
#generate all the feature vectors for Training and Test Data
biGramTrainingVecs = biGramFrequency(trainDF[0].tolist(),indexedBiGram)
biGramTestVecs = biGramFrequency(testDF[0].tolist(),indexedBiGram)

In [50]:
#Report feature vectors any two reviews
print biGramTrainingVecs.head(2)

   tension medical  give chill  clock docking  scratched protective  \
0                0           0              0                     0   
1                0           0              0                     0   

   problem others  expression feeling  except cole  played period  \
0               0                   0            0              0   
1               0                   0            0              0   

   friend enjoy  eaten multiple     ...      lucy bell  phone mp3  slow take  \
0             0               0     ...              0          0          0   
1             0               0     ...              0          0          0   

   day week  must shakespear  touching character  barely lukewarm  headset pc  \
0         0                0                   0                0           0   
1         0                0                   0                0           0   

   later returned  prime time  
0               0           0  
1               0           0

### Run Logistic Regression on the normalized train and test vectors obtained from the Bi Gram model

In [51]:
normalizedBiGramTrainingVecs = normalize(biGramTrainingVecs)
normalizedBiGramTestVecs = normalize(biGramTestVecs)

In [56]:
biGramScore,biGramConfusionMatrix = logisticRegression(normalizedBiGramTrainingVecs,trainDF[1],
                                                       normalizedBiGramTestVecs,testDF[1])

In [57]:
print biGramScore
print biGramConfusionMatrix

0.976666666667
[[300  14]
 [  0 286]]


## 1.i PCA for bag of words model.

### PCA Algorithm

1. Center the data by subtracting the mean from it.
2. Calculate the covariance matrix.
3. Calculate the eigenvectors of the covariance matrix.

In [60]:
def center(x,mean):
    for row in x:
        row -= mean
    return x

In [61]:
trainingMean = normalizedTraining.mean()
testMean = normalizedTest.mean()

In [62]:
centeredTraining = center(normalizedTraining,trainingMean)
centeredTest = center(normalizedTest,testMean)

In [66]:
#compute SVD to get the eigenvectors and the eigenvalues
UTrain,DTrain, VTrain = scipy.linalg.svd(centeredTraining, full_matrices=False)
UTest, DTest, VTest = scipy.linalg.svd(centeredTest, full_matrices=False)