## Imports

In [1]:
import pandas as pd
import numpy as np
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import re
from sklearn import preprocessing
import random
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
import scipy
from sklearn import decomposition
import operator

## 1a. Download and  load Data

In [2]:
#File Locations
amazonFileName = "sentiment labelled sentences/amazon_cells_labelled.txt"
yelpFileName = "sentiment labelled sentences/yelp_labelled.txt"
imdbFileName = "sentiment labelled sentences/imdb_labelled.txt"

In [3]:
#Special case - imdb, both tabs and newlines need to be taken care of
def cleanFile(fname):
    f = open(fname,"r")
    cleanOutput=[]
    for line in f:
        line = line.replace("\n","")
        cleanOutput.append(line.split("\t"))
    return cleanOutput

In [4]:
#Import Data into Pandas
amazon = pd.read_csv(amazonFileName, sep="\t",header=None).dropna().reset_index().drop("index",axis =1)
yelp = pd.read_csv(yelpFileName, sep="\t", header=None,encoding="utf-8").dropna().reset_index().drop("index", axis=1)

cleanImdb = cleanFile(imdbFileName)
imdb = pd.DataFrame(cleanImdb)

In [5]:
def getLabelRatio(df):
    df[1] = df[1].map(int)
    reviews = df[df[1] == 0]
    zeroRatio = reviews.shape[0]/ float(df.shape[0])
    print "The file contains " + str(zeroRatio) + "% of 0s and " + str(1 - zeroRatio) + "% of 1s."

In [6]:
getLabelRatio(amazon)
getLabelRatio(yelp)
getLabelRatio(imdb)

The file contains 0.5% of 0s and 0.5% of 1s.
The file contains 0.5% of 0s and 0.5% of 1s.
The file contains 0.5% of 0s and 0.5% of 1s.


In [7]:
print imdb

                                                     0  1
0    A very, very, very slow-moving, aimless movie ...  0
1    Not sure who was more lost - the flat characte...  0
2    Attempting artiness with black & white and cle...  0
3         Very little music or anything to speak of.    0
4    The best scene in the movie was when Gerardo i...  1
5    The rest of the movie lacks art, charm, meanin...  0
6                                  Wasted two hours.    0
7    Saw the movie today and thought it was a good ...  1
8                                 A bit predictable.    0
9    Loved the casting of Jimmy Buffet as the scien...  1
10                And those baby owls were adorable.    1
11   The movie showed a lot of Florida at it's best...  1
12   The Songs Were The Best And The Muppets Were S...  1
13                                   It Was So Cool.    1
14   This is a very "right on case" movie that deli...  1
15   It had some average acting from the main perso...  0
16   This revi

## 1.b Preprocessing Strategy

In [8]:
def lowerCase(x):
    """
    x is a string. Every letter in x is turned to lowercase
    """
    return x.lower()

def lemmatize(lmt,x):
    """
    It takes a word x and the lemmatizer lmt from nltk. Lmt lemmatizes x
    """
    return lmt.lemmatize(x)

def stripPunctuation(s):
    """
    It takes a string x and uses regular expression library to remove punctuation from it.
    Reference: https://www.quora.com/How-do-I-remove-punctuation-from-a-Python-string
    """
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    return regex.sub(' ', s)


def getStopWords():
    # Do all stop words need to be excluded. Should we keep negatives like no or not? - Francesco 10/29.
    return set(stopwords.words("english")) 

def preProcessing(x):
    processedString = []
    lemmatizer = WordNetLemmatizer()
    stopWords = getStopWords()
    xLowerCase = lowerCase(x)
    xWithoutPunct = stripPunctuation(xLowerCase)
    
    for word in xWithoutPunct.split():
        try:
            if unicode(word) not in stopWords:
                lemmatizedWord = lemmatize(lemmatizer, unicode(word))
                processedString.append(str(lemmatizedWord))
        except (UnicodeDecodeError, UnicodeEncodeError):
            print "UnicodeDecodeError: " + word
       
    return processedString    

In [9]:
"""
Apply the preprocessing method on the sentences in the amazon,yelp and imdb dataframes.
"""
amazon[0] = amazon[0].map(preProcessing)
yelp[0] = yelp[0].map(preProcessing)
imdb[0] = imdb[0].map(preProcessing)

UnicodeDecodeError: fiancé
UnicodeDecodeError: café
UnicodeDecodeError: crêpe
UnicodeDecodeError: puréed
UnicodeDecodeError: 
UnicodeDecodeError: québec
UnicodeDecodeError: iswas
UnicodeDecodeError: 
UnicodeDecodeError: clichés
UnicodeDecodeError: clichés
UnicodeDecodeError: aurvåg
UnicodeDecodeError: 
UnicodeDecodeError: problemsthe
UnicodeDecodeError: 
UnicodeDecodeError: clichés
UnicodeDecodeError: 
UnicodeDecodeError: seeing


In [10]:
"""
The label values in the imdb dataframe need to casted to ints
"""
imdb[1] = imdb[1].map(int)

## 1.c Split training and testing set

### Amazon

In [11]:
amazonPositiveTrain = amazon[amazon[1] == 1].head(400)
amazonPositiveTest = amazon[amazon[1] == 1].head(100)
amazonNegativeTrain = amazon[amazon[1] == 0].head(400)
amazonNegativeTest = amazon[amazon[1] == 0].head(100)

### Yelp

In [12]:
yelpPositiveTrain = yelp[yelp[1] == 1].head(400)
yelpPositiveTest = yelp[yelp[1] == 1].head(100)
yelpNegativeTrain = yelp[yelp[1] == 0].head(400)
yelpNegativeTest = yelp[yelp[1] == 0].head(100)

### Imdb

In [13]:
imdbPositiveTrain = imdb[imdb[1] == 1].head(400)
imdbPositiveTest = imdb[imdb[1] == 1].head(100)
imdbNegativeTrain = imdb[imdb[1] == 0].head(400)
imdbNegativeTest = imdb[imdb[1] == 0].head(100)

In [14]:
trainFrames = [amazonPositiveTrain,amazonNegativeTrain,
               yelpPositiveTrain,yelpNegativeTrain,
               imdbPositiveTrain,imdbNegativeTrain]

testFrames = [amazonPositiveTest,amazonNegativeTest,
              yelpPositiveTest,yelpNegativeTest,
              imdbPositiveTest,imdbNegativeTest]

trainDF = pd.concat(trainFrames).reset_index().drop("index",axis=1)
testDF =  pd.concat(testFrames).reset_index().drop("index",axis=1)

In [15]:
print trainDF.shape
print testDF.shape

(2400, 2)
(600, 2)


## 1.d Bag of Words model

In [16]:
def generateBag(reviews):
    """
    words is a list of words. The function creates a dictionary of unique words
    """
    wordBag ={}
    for review in reviews:
        for word in review:
            if word not in wordBag.keys():
                wordBag[word] = 0
    return wordBag

def indexBag(bag):
    """
    bag is the input. The function essentially assigns an index for each word in the bag.The index will be needed
    for counting the frequency of its word and create the correct feature vector.
    """
    idx = 0
    for word in bag.keys():
        bag[word] = idx
        idx+=1
    return bag

def wordFrequency(reviews,bag):
    """
    iterates through a review and using the bag, it returns the feature vectors of all the reviews.
    """
    featureVectorList = []
    for review in reviews:
        featureVec = [0.0 for uniqueWord in range(0,len(bag.keys()))]
        for word in review:
            if word in bag.keys():
                #update the value in featureVec whose index is found in bag[word]
                featureVec[bag[word]]+=1.0
            else:
                print "Error: " + word + " is not in the bag."
        featureVectorList.append(featureVec)
    return pd.DataFrame(featureVectorList,columns = bag.keys())

In [17]:
#generate bag only from Training Data
bagOfWords = generateBag(trainDF[0].tolist())
indexedBag = indexBag(bagOfWords)

In [18]:
#generate all the feature vectors for Training and Test Data
trainingVecs = wordFrequency(trainDF[0].tolist(),indexedBag)
testVecs = wordFrequency(testDF[0].tolist(),indexedBag)

In [19]:
#Report feature vectors any two reviews
print trainingVecs.head(2)

   limited  versatile  secondly  magnetic  personally  bear  yellow  sleek  \
0        0          0         0         0           0     0       0      0   
1        0          0         0         0           0     0       0      0   

   four  sleep   ...     jewel  forgetting  portion  pandering  compete  \
0     0      0   ...         0           0        0          0        0   
1     0      0   ...         0           0        0          0        0   

   monstrous  searched  yell  abhor  jawbone  
0          0         0     0      0        0  
1          0         0     0      0        1  

[2 rows x 4075 columns]


## 1.e Postprocessing Strategy

In [20]:
### L1 or lasso was chosen as the normalization method because it best suits sparse data.
def normalize(df):
    normalizedDf = preprocessing.normalize(df, norm='l1')
    return pd.DataFrame(normalizedDf)

In [21]:
normalizedTraining = normalize(trainingVecs)
normalizedTest = normalize(testVecs)

## 1.f Training set clustering (K-means implementation)

In [22]:
def getFirstMeans(trainData,k):
    firstMeans = []
    idxs = [x for x in range(len(trainData))]
    meanIdxs = random.sample(idxs,k)
    for idx in meanIdxs:
        firstMeans.append(trainData.iloc[idx].values)
    return firstMeans

def clustering(trainData,clusterMeans,k):
    rowIdxs = [i for i in range(len(trainData))]
    clusters = []
    for num in range(k):
        clusters.append([])
    
    for row in rowIdxs :
        distances = []
        for mean in clusterMeans:
            dist = np.linalg.norm(trainData.iloc[row] - mean)
            distances.append(dist)
        clusterIdx = np.argmin(distances)
        
        clusters[clusterIdx].append(row)
    return clusters
    
def newMeans(trainData,clusters):
    means = []
    for cluster in clusters:
        clusterMean = np.mean(trainData.iloc[cluster])
        means.append(clusterMean)
    return means

def KMeans(trainData,k):
    iters = 0
    initMeans = getFirstMeans(trainData,k)
    clusters1 = clustering(trainData,initMeans,k)
    means1 =  newMeans(trainData,clusters1)
    clusters2 = clustering(trainData,means1,k)
    means2 = newMeans(trainData,clusters2)
    while clusters1 != clusters2:
        print "iteration: " + str(iters+1)
        clusters1 = clustering(trainData,means2,k)
        means1 = newMeans(trainData,clusters1)
        clusters2 = clustering(trainData,means1,k)
        means2 = newMeans(trainData,clusters2)
        iters += 1
    
    return clusters2,means2

    
    
    

In [23]:
k = 2
kMeansCluster,kMeansClusterMean= KMeans(normalizedTraining,k)

2 2


In [24]:
print len(kMeansCluster)
print kMeansClusterMean

2
[0       0
1       0
2       0
3       0
4       0
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      0
15      0
16      0
17      0
18      0
19      0
20      0
21      0
22      0
23      0
24      0
25      0
26      0
27      0
28      0
29      0
       ..
4045    0
4046    0
4047    0
4048    0
4049    0
4050    0
4051    0
4052    0
4053    0
4054    0
4055    0
4056    0
4057    0
4058    0
4059    0
4060    0
4061    0
4062    0
4063    0
4064    0
4065    0
4066    0
4067    0
4068    0
4069    0
4070    0
4071    0
4072    0
4073    0
4074    0
dtype: float64, 0       0.000178
1       0.000042
2       0.000052
3       0.000060
4       0.000077
5       0.000030
6       0.000070
7       0.000073
8       0.000133
9       0.000084
10      0.000719
11      0.000581
12      0.000419
13      0.000060
14      0.000038
15      0.000030
16      0.000120
17      0.000115
18      0.000047
19      0.000022
20      0.000038
21      0.0005

### Report Results

In [25]:
def displayResults(dfTrain,clusters):
    dfTrain[1] = dfTrain[1].map(int)
    for i,cluster in enumerate(clusters):
        clusterData = dfTrain.iloc[cluster]
        clusterData[1] = clusterData[1].map(int)
        print
        print "Number of points in cluster " + str(i) + " :"  + str(len(cluster))
        print "Number of 1s in cluster " + str(i) + " :" + str(clusterData[clusterData[1] == 1].shape[0])
        print "percent of 1s in  cluster " + str(i) + " :" + str((clusterData[clusterData[1] == 1].shape[0])/float(len(cluster))) 
        print "Number of 0s in cluster " + str(i) + " :" + str(clusterData[clusterData[1] == 0].shape[0])
        print "percent of 0s in cluster " +  str(i) + " :" + str((clusterData[clusterData[1] == 0].shape[0])/float(len(cluster)))

In [26]:
displayResults(trainDF,kMeansCluster)


Number of points in cluster 0 :11
Number of 1s in cluster 0 :3
percent of 1s in  cluster 0 :0.272727272727
Number of 0s in cluster 0 :8
percent of 0s in cluster 0 :0.727272727273

Number of points in cluster 1 :2389
Number of 1s in cluster 1 :1197
percent of 1s in  cluster 1 :0.501046462955
Number of 0s in cluster 1 :1192
percent of 0s in cluster 1 :0.498953537045


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## 1.g Sentiment prediction

In [27]:
def logisticRegression(trainVecs,trainLabels,testVecs,testLabels):
    model = linear_model.LogisticRegression()
    modelFit = model.fit(trainVecs, trainLabels)
    score = modelFit.score(testVecs, testLabels)
    pred = modelFit.predict(testVecs)
    confusionMatrix = confusion_matrix(pred, testLabels)
    W = model.coef_
    return score,confusionMatrix,W

In [28]:
score,confusionMatrix,weightVec = logisticRegression(normalizedTraining,trainDF[1],normalizedTest,testDF[1])

In [29]:
print score
print confusionMatrix

0.843333333333
[[270  64]
 [ 30 236]]


In [30]:
def getInfluentialWords(W,vecList):
    wordVector = vecList.loc[0].index
    maxNegIdx = W.argmin()
    maxPosIdx = W.argmax()
    negWord = wordVector[maxNegIdx] 
    posWord = wordVector[maxPosIdx]
    print "The word that most impacts negative reviews is: " + str(negWord)
    print "The word that most impacts positive reviews is: " + str(posWord)
    
    

In [31]:
getInfluentialWords(weightVec,trainingVecs)

The word that most impacts negative reviews is: bad
The word that most impacts positive reviews is: great


## 1.h N-gram model

In [32]:
def generateBiGramDict(reviews):
    """
    words is a list of words. The function creates a dictionary of unique words
    """
    biGramDict ={}
    for review in reviews:
        for i in range(len(review)-1):
            biGram= review[i] + " " + review[i+1]
            #biGram = str(review)
            if biGram not in  biGramDict.keys():
                biGramDict[biGram] = 0
    return biGramDict

def indexBiGram(d):
    """
    bag is the input. The function essentially assigns an index for each word in the bag.The index will be needed
    for counting the frequency of its word and create the correct feature vector.
    """
    idx = 0
    for biGram in d.keys():
        d[biGram] = idx
        idx+=1
    return d

def biGramFrequency(reviews,d):
    """
    iterates through a review and using the bag, it returns the feature vector of the review.
    """
    featureVectorList = []
    for review in reviews:
        featureVec = [0.0 for x in range(0,len(d.keys()))]
        for i in range(len(review)-1):
            biGram= review[i] + " " + review[i+1]
            if biGram in d.keys():
                #update the value in featureVec whose index is found in bag[word]
                featureVec[d[biGram]]+=1.0
            else:
                print "Error: " + biGram + " not in dictionary."
        featureVectorList.append(featureVec)
    return pd.DataFrame(featureVectorList,columns = d.keys())

In [33]:
#generate bag only from Training Data
biGramDict = generateBiGramDict(trainDF[0].tolist())
indexedBiGram = indexBiGram(biGramDict)

In [34]:
#generate all the feature vectors for Training and Test Data
biGramTrainingVecs = biGramFrequency(trainDF[0].tolist(),indexedBiGram)
biGramTestVecs = biGramFrequency(testDF[0].tolist(),indexedBiGram)

In [35]:
#Report feature vectors any two reviews
print biGramTrainingVecs.head(2)

   tension medical  give chill  clock docking  scratched protective  \
0                0           0              0                     0   
1                0           0              0                     0   

   problem others  expression feeling  except cole  played period  \
0               0                   0            0              0   
1               0                   0            0              0   

   friend enjoy  eaten multiple     ...      lucy bell  phone mp3  slow take  \
0             0               0     ...              0          0          0   
1             0               0     ...              0          0          0   

   day week  must shakespear  touching character  barely lukewarm  headset pc  \
0         0                0                   0                0           0   
1         0                0                   0                0           0   

   later returned  prime time  
0               0           0  
1               0           0

### Run Logistic Regression on the normalized train and test vectors obtained from the Bi Gram model

In [36]:
normalizedBiGramTrainingVecs = normalize(biGramTrainingVecs)
normalizedBiGramTestVecs = normalize(biGramTestVecs)

In [37]:
biGramScore,biGramConfusionMatrix,biGramWeightVec = logisticRegression(normalizedBiGramTrainingVecs,trainDF[1],
                                                       normalizedBiGramTestVecs,testDF[1])

In [38]:
print biGramScore
print biGramConfusionMatrix

0.976666666667
[[300  14]
 [  0 286]]


In [39]:
getInfluentialWords(biGramWeightVec,biGramTrainingVecs)

The word that most impacts negative reviews is: waste time
The word that most impacts positive reviews is: work great


## 1.i PCA for bag of words model.

### PCA Algorithm

1. Center the data by subtracting the mean from it.
2. Calculate the covariance matrix.
3. Calculate the eigenvectors of the covariance matrix.

In [40]:
def center(x,mean):
    for row in x:
        row -= mean
    return x

In [41]:
trainingMean = normalizedTraining.mean()
testMean = normalizedTest.mean()

In [42]:
centeredTraining = center(normalizedTraining,trainingMean)
centeredTest = center(normalizedTest,testMean)

In [43]:
#compute SVD to get the eigenvectors and the eigenvalues
UTrain,DTrain, VTrain = scipy.linalg.svd(centeredTraining, full_matrices=False)
UTest, DTest, VTest = scipy.linalg.svd(centeredTest, full_matrices=False)

In [44]:
print UTrain.shape,DTrain.shape, VTrain.shape
print UTest.shape, DTest.shape, VTest.shape

(2400, 2400) (2400,) (2400, 4075)
(600, 600) (600,) (600, 4075)


In [45]:
def PCA(U,D,V,x):
    """
    takes the SVD decomposition matrices and returns a lower dimensional representation of the data
    """
    size = D.shape[0]
    newD = np.zeros((size,size))
    newDiagonal = np.concatenate((D[:x],[0] * (len(D) - x)))
    i  = 0
    j = 0
    while i < size and j < size:
        newD[i][j] = newDiagonal[i]
        i+=1
        j+=1
    return pd.DataFrame(np.dot(U, np.dot(newD,V)))


In [46]:
train10 = PCA(UTrain,DTrain,VTrain,10)
train50 = PCA(UTrain,DTrain,VTrain,50)
train100 = PCA(UTrain,DTrain,VTrain,100)

test10 = PCA(UTest,DTest,VTest,10)
test50 = PCA(UTest,DTest,VTest,50)
test100 = PCA(UTest,DTest,VTest,10)

In [47]:
normalizedTrain10 = normalize(train10)
normalizedTest10 = normalize(test10)

normalizedTrain50 = normalize(train50)
normalizedTest50 = normalize(test50)

normalizedTrain100 = normalize(train100)
normalizedTest100 = normalize(test100)

In [48]:
clusterNum = 2
kMeansCluster10,kMeansClusterMean10= KMeans(normalizedTrain10,clusterNum)
kMeansCluster50,kMeansClusterMean50= KMeans(normalizedTrain50,clusterNum)
kMeansCluster100,kMeansClusterMean100= KMeans(normalizedTrain100,clusterNum)

2 2
iteration: 1
2 2
iteration: 2
2 2
iteration: 3
2 2
iteration: 4
2 2
2 2
iteration: 1
2 2
iteration: 2
2 2
2 2
iteration: 1
2 2
iteration: 2
2 2
iteration: 3
2 2
iteration: 4
2 2
iteration: 5
2 2
iteration: 6
2 2


In [49]:
displayResults(trainDF,kMeansCluster10)


Number of points in cluster 0 :168
Number of 1s in cluster 0 :149
percent of 1s in  cluster 0 :0.886904761905
Number of 0s in cluster 0 :19
percent of 0s in cluster 0 :0.113095238095

Number of points in cluster 1 :2232
Number of 1s in cluster 1 :1051
percent of 1s in  cluster 1 :0.470878136201
Number of 0s in cluster 1 :1181
percent of 0s in cluster 1 :0.529121863799


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [50]:
displayResults(trainDF,kMeansCluster50)


Number of points in cluster 0 :104
Number of 1s in cluster 0 :62
percent of 1s in  cluster 0 :0.596153846154
Number of 0s in cluster 0 :42
percent of 0s in cluster 0 :0.403846153846

Number of points in cluster 1 :2296
Number of 1s in cluster 1 :1138
percent of 1s in  cluster 1 :0.495644599303
Number of 0s in cluster 1 :1158
percent of 0s in cluster 1 :0.504355400697


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [51]:
displayResults(trainDF,kMeansCluster100)


Number of points in cluster 0 :85
Number of 1s in cluster 0 :45
percent of 1s in  cluster 0 :0.529411764706
Number of 0s in cluster 0 :40
percent of 0s in cluster 0 :0.470588235294

Number of points in cluster 1 :2315
Number of 1s in cluster 1 :1155
percent of 1s in  cluster 1 :0.498920086393
Number of 0s in cluster 1 :1160
percent of 0s in cluster 1 :0.501079913607


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [53]:
score10,confusionMatrix10,weightVec10 = logisticRegression(normalizedTrain10,trainDF[1],normalizedTest10,testDF[1])
score50,confusionMatrix50,weightVec50 = logisticRegression(normalizedTrain50,trainDF[1],normalizedTest50,testDF[1])
score100,confusionMatrix100,weightVec100 = logisticRegression(normalizedTrain100,trainDF[1],normalizedTest100,testDF[1])

In [54]:
print score10,score50,score100

0.608333333333 0.7 0.596666666667


In [55]:
print confusionMatrix10
print confusionMatrix50
print confusionMatrix100

[[209 144]
 [ 91 156]]
[[242 122]
 [ 58 178]]
[[185 127]
 [115 173]]
