## Imports

In [77]:
import pandas as pd
import numpy as np
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import re
from sklearn import preprocessing
import random
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
import scipy
from sklearn import decomposition
import operator

## 1a. Download and  load Data

In [78]:
#File Locations
amazonFileName = "sentiment labelled sentences/amazon_cells_labelled.txt"
yelpFileName = "sentiment labelled sentences/yelp_labelled.txt"
imdbFileName = "sentiment labelled sentences/imdb_labelled.txt"

In [79]:
#Special case - imdb, both tabs and newlines need to be taken care of
def cleanFile(fname):
    f = open(fname,"r")
    cleanOutput=[]
    for line in f:
        line = line.replace("\n","")
        cleanOutput.append(line.split("\t"))
    return cleanOutput

In [80]:
#Import Data into Pandas
amazon = pd.read_csv(amazonFileName, sep="\t",header=None).dropna().reset_index().drop("index",axis =1)
yelp = pd.read_csv(yelpFileName, sep="\t", header=None,encoding="utf-8").dropna().reset_index().drop("index", axis=1)

cleanImdb = cleanFile(imdbFileName)
imdb = pd.DataFrame(cleanImdb)

In [81]:
def getLabelRatio(df):
    df[1] = df[1].map(int)
    reviews = df[df[1] == 0]
    zeroRatio = reviews.shape[0]/ float(df.shape[0])
    print "The file contains " + str(zeroRatio) + "% of 0s and " + str(1 - zeroRatio) + "% of 1s."

In [82]:
getLabelRatio(amazon)
getLabelRatio(yelp)
getLabelRatio(imdb)

The file contains 0.5% of 0s and 0.5% of 1s.
The file contains 0.5% of 0s and 0.5% of 1s.
The file contains 0.5% of 0s and 0.5% of 1s.


In [83]:
print imdb

                                                     0  1
0    A very, very, very slow-moving, aimless movie ...  0
1    Not sure who was more lost - the flat characte...  0
2    Attempting artiness with black & white and cle...  0
3         Very little music or anything to speak of.    0
4    The best scene in the movie was when Gerardo i...  1
5    The rest of the movie lacks art, charm, meanin...  0
6                                  Wasted two hours.    0
7    Saw the movie today and thought it was a good ...  1
8                                 A bit predictable.    0
9    Loved the casting of Jimmy Buffet as the scien...  1
10                And those baby owls were adorable.    1
11   The movie showed a lot of Florida at it's best...  1
12   The Songs Were The Best And The Muppets Were S...  1
13                                   It Was So Cool.    1
14   This is a very "right on case" movie that deli...  1
15   It had some average acting from the main perso...  0
16   This revi

## 1.b Preprocessing Strategy

In [84]:
def lowerCase(x):
    """
    x is a string. Every letter in x is turned to lowercase
    """
    return x.lower()

def lemmatize(lmt,x):
    """
    It takes a word x and the lemmatizer lmt from nltk. Lmt lemmatizes x
    """
    return lmt.lemmatize(x)

def stripPunctuation(s):
    """
    It takes a string x and uses regular expression library to remove punctuation from it.
    Reference: https://www.quora.com/How-do-I-remove-punctuation-from-a-Python-string
    """
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    return regex.sub(' ', s)


def getStopWords():
    # Do all stop words need to be excluded. Should we keep negatives like no or not? - Francesco 10/29.
    return set(stopwords.words("english")) - {"not","no"}

def preProcessing(x):
    processedString = []
    lemmatizer = WordNetLemmatizer()
    stopWords = getStopWords()
    xLowerCase = lowerCase(x)
    xWithoutPunct = stripPunctuation(xLowerCase)
    
    for word in xWithoutPunct.split():
        try:
            if unicode(word) not in stopWords:
                lemmatizedWord = lemmatize(lemmatizer, unicode(word))
                processedString.append(str(lemmatizedWord))
        except (UnicodeDecodeError, UnicodeEncodeError):
            print "UnicodeDecodeError: " + word
       
    return processedString    

In [85]:
#Apply the preprocessing method on the sentences in the amazon,yelp and imdb dataframes.

amazon[0] = amazon[0].map(preProcessing)
yelp[0] = yelp[0].map(preProcessing)
imdb[0] = imdb[0].map(preProcessing)

UnicodeDecodeError: fiancé
UnicodeDecodeError: café
UnicodeDecodeError: crêpe
UnicodeDecodeError: puréed
UnicodeDecodeError: 
UnicodeDecodeError: québec
UnicodeDecodeError: iswas
UnicodeDecodeError: 
UnicodeDecodeError: clichés
UnicodeDecodeError: clichés
UnicodeDecodeError: aurvåg
UnicodeDecodeError: 
UnicodeDecodeError: problemsthe
UnicodeDecodeError: 
UnicodeDecodeError: clichés
UnicodeDecodeError: 
UnicodeDecodeError: seeing


In [86]:
#The label values in the imdb dataframe need to casted to ints

imdb[1] = imdb[1].map(int)

## 1.c Split training and testing set

### Amazon

In [87]:
amazonPositiveTrain = amazon[amazon[1] == 1].head(400)
amazonPositiveTest = amazon[amazon[1] == 1].tail(100)
amazonNegativeTrain = amazon[amazon[1] == 0].head(400)
amazonNegativeTest = amazon[amazon[1] == 0].tail(100)

### Yelp

In [88]:
yelpPositiveTrain = yelp[yelp[1] == 1].head(400)
yelpPositiveTest = yelp[yelp[1] == 1].tail(100)
yelpNegativeTrain = yelp[yelp[1] == 0].head(400)
yelpNegativeTest = yelp[yelp[1] == 0].tail(100)

### Imdb

In [89]:
imdbPositiveTrain = imdb[imdb[1] == 1].head(400)
imdbPositiveTest = imdb[imdb[1] == 1].tail(100)
imdbNegativeTrain = imdb[imdb[1] == 0].head(400)
imdbNegativeTest = imdb[imdb[1] == 0].tail(100)

In [90]:
trainFrames = [amazonPositiveTrain,amazonNegativeTrain,
               yelpPositiveTrain,yelpNegativeTrain,
               imdbPositiveTrain,imdbNegativeTrain]

testFrames = [amazonPositiveTest,amazonNegativeTest,
              yelpPositiveTest,yelpNegativeTest,
              imdbPositiveTest,imdbNegativeTest]

trainDF = pd.concat(trainFrames).reset_index().drop("index",axis=1)
testDF =  pd.concat(testFrames).reset_index().drop("index",axis=1)

In [91]:
#Check the the dimensions of the train and test dataframes.
print trainDF.shape
print testDF.shape

(2400, 2)
(600, 2)


## 1.d Bag of Words model

In [92]:
print trainDF.head(3)

                                0  1
0  [good, case, excellent, value]  1
1                [great, jawbone]  1
2                    [mic, great]  1


In [93]:
def generateBag(reviews):
    """
    words is a list of words. The function creates a dictionary of unique words
    """
    wordBag ={}
    for review in reviews:
        for word in review:
            if word not in wordBag.keys():
                wordBag[word] = 0
    return wordBag

def indexBag(bag):
    """
    bag is the input. The function essentially assigns an index for each word in the bag.The index will be needed
    for counting the frequency of its word and create the correct feature vector.
    """
    idx = 0
    for word in bag.keys():
        bag[word] = idx
        idx+=1
    return bag

def wordFrequency(reviews,bag):
    """
    iterates through a review and using the bag, it returns the feature vectors of all the reviews.
    """
    featureVectorList = []
    for review in reviews:
        featureVec = [0.0 for uniqueWord in range(0,len(bag.keys()))]
        for word in review:
            if word in bag.keys():
                #update the value in featureVec whose index is found in bag[word]
                featureVec[bag[word]]+=1.0
            else:
                print "Error: " + word + " is not in the bag."
        featureVectorList.append(featureVec)
    return pd.DataFrame(featureVectorList,columns = bag.keys())

In [94]:
#generate bag only from Training Data
bagOfWords = generateBag(trainDF[0].tolist())
indexedBag = indexBag(bagOfWords)

In [95]:
#generate all the feature vectors for Training and Test Data
trainingVecs = wordFrequency(trainDF[0].tolist(),indexedBag)
testVecs = wordFrequency(testDF[0].tolist(),indexedBag)

Error: entertainment is not in the bag.
Error: activesync is not in the bag.
Error: optimal is not in the bag.
Error: synchronization is not in the bag.
Error: ps3 is not in the bag.
Error: cheapy is not in the bag.
Error: shiny is not in the bag.
Error: exceeds is not in the bag.
Error: sight is not in the bag.
Error: answer is not in the bag.
Error: laptop is not in the bag.
Error: ir is not in the bag.
Error: laptop is not in the bag.
Error: cancellation is not in the bag.
Error: travled is not in the bag.
Error: swivel is not in the bag.
Error: dual is not in the bag.
Error: keeping is not in the bag.
Error: maintains is not in the bag.
Error: flawless is not in the bag.
Error: normal is not in the bag.
Error: land is not in the bag.
Error: owning is not in the bag.
Error: official is not in the bag.
Error: oem is not in the bag.
Error: loudest is not in the bag.
Error: competitor is not in the bag.
Error: saved is not in the bag.
Error: alot is not in the bag.
Error: leopard is no

In [96]:
#Report feature vectors any two reviews
print trainingVecs.head(2)

   limited  versatile  secondly  magnetic  personally  bear  yellow  sleek  \
0        0          0         0         0           0     0       0      0   
1        0          0         0         0           0     0       0      0   

   four  sleep   ...     jewel  forgetting  portion  pandering  compete  \
0     0      0   ...         0           0        0          0        0   
1     0      0   ...         0           0        0          0        0   

   monstrous  searched  yell  abhor  jawbone  
0          0         0     0      0        0  
1          0         0     0      0        1  

[2 rows x 4077 columns]


## 1.e Postprocessing Strategy

In [97]:
### L1 or lasso was chosen as the normalization method because it best suits sparse data.
def normalize(df):
    normalizedDf = preprocessing.normalize(df, norm='l1')
    return pd.DataFrame(normalizedDf)

In [98]:
normalizedTraining = normalize(trainingVecs)
normalizedTest = normalize(testVecs)

## 1.f Training set clustering (K-means implementation)

In [99]:
def getFirstMeans(trainData,k):
    firstMeans = []
    idxs = [x for x in range(len(trainData))]
    meanIdxs = random.sample(idxs,k)
    for idx in meanIdxs:
        firstMeans.append(trainData.iloc[idx].values)
    return firstMeans

def clustering(trainData,clusterMeans,k):
    rowIdxs = [i for i in range(len(trainData))]
    clusters = []
    for num in range(k):
        clusters.append([])
    
    for row in rowIdxs :
        distances = []
        for mean in clusterMeans:
            dist = np.linalg.norm(trainData.iloc[row] - mean)
            distances.append(dist)
        clusterIdx = np.argmin(distances)
        
        clusters[clusterIdx].append(row)
    return clusters
    
def newMeans(trainData,clusters):
    means = []
    for cluster in clusters:
        clusterMean = np.mean(trainData.iloc[cluster])
        means.append(clusterMean)
    return means

def KMeans(trainData,k):
    iters = 0
    initMeans = getFirstMeans(trainData,k)
    clusters1 = clustering(trainData,initMeans,k)
    means1 =  newMeans(trainData,clusters1)
    clusters2 = clustering(trainData,means1,k)
    means2 = newMeans(trainData,clusters2)
    while clusters1 != clusters2:
        print "iteration: " + str(iters+1)
        clusters1 = clustering(trainData,means2,k)
        means1 = newMeans(trainData,clusters1)
        clusters2 = clustering(trainData,means1,k)
        means2 = newMeans(trainData,clusters2)
        iters += 1
    
    return clusters2,means2   

In [100]:
k = 2
kMeansCluster,kMeansClusterMean= KMeans(normalizedTraining,k)

iteration: 1
iteration: 2


In [101]:
#Rreporting the centers of k clusters.
print kMeansClusterMean

[0       0.000176
1       0.000042
2       0.000053
3       0.000053
4       0.000078
5       0.000030
6       0.000070
7       0.000073
8       0.000131
9       0.000085
10      0.000691
11      0.000587
12      0.000388
13      0.000060
14      0.000038
15      0.000030
16      0.000121
17      0.000116
18      0.000047
19      0.000022
20      0.000035
21      0.000420
22      0.000014
23      0.000211
24      0.002233
25      0.000042
26      0.000042
27      0.000070
28      0.000042
29      0.000361
          ...   
4047    0.000196
4048    0.000033
4049    0.000272
4050    0.000018
4051    0.000902
4052    0.000035
4053    0.001821
4054    0.000047
4055    0.000106
4056    0.000072
4057    0.000673
4058    0.000060
4059    0.000069
4060    0.000342
4061    0.000053
4062    0.000035
4063    0.000010
4064    0.000053
4065    0.000300
4066    0.000030
4067    0.000038
4068    0.000115
4069    0.000694
4070    0.000106
4071    0.000030
4072    0.000018
4073    0.000070
4074    0.000

### Report Results

In [102]:
def displayResults(dfTrain,clusters):
    dfTrain[1] = dfTrain[1].map(int)
    for i,cluster in enumerate(clusters):
        clusterData = dfTrain.iloc[cluster]
        clusterData[1] = clusterData[1].map(int)
        print
        print "Number of points in cluster " + str(i) + " :"  + str(len(cluster))
        print "Number of 1s in cluster " + str(i) + " :" + str(clusterData[clusterData[1] == 1].shape[0])
        print "percent of 1s in  cluster " + str(i) + " :" + str((clusterData[clusterData[1] == 1].shape[0])/float(len(cluster))) 
        print "Number of 0s in cluster " + str(i) + " :" + str(clusterData[clusterData[1] == 0].shape[0])
        print "percent of 0s in cluster " +  str(i) + " :" + str((clusterData[clusterData[1] == 0].shape[0])/float(len(cluster)))

In [103]:
displayResults(trainDF,kMeansCluster)


Number of points in cluster 0 :2365
Number of 1s in cluster 0 :1199
percent of 1s in  cluster 0 :0.506976744186
Number of 0s in cluster 0 :1166
percent of 0s in cluster 0 :0.493023255814

Number of points in cluster 1 :35
Number of 1s in cluster 1 :1
percent of 1s in  cluster 1 :0.0285714285714
Number of 0s in cluster 1 :34
percent of 0s in cluster 1 :0.971428571429


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## 1.g Sentiment prediction

In [104]:
def logisticRegression(trainVecs,trainLabels,testVecs,testLabels):
    model = linear_model.LogisticRegression()
    modelFit = model.fit(trainVecs, trainLabels)
    score = modelFit.score(testVecs, testLabels)
    pred = modelFit.predict(testVecs)
    confusionMatrix = confusion_matrix(pred, testLabels)
    W = model.coef_
    return score,confusionMatrix,W

In [105]:
score,confusionMatrix,weightVec = logisticRegression(normalizedTraining,trainDF[1],normalizedTest,testDF[1])

In [106]:
print score
print confusionMatrix

0.805
[[261  78]
 [ 39 222]]


In [107]:
def getInfluentialWords(W,vecList):
    wordVector = vecList.loc[0].index
    maxNegIdx = W.argmin()
    maxPosIdx = W.argmax()
    negWord = wordVector[maxNegIdx] 
    posWord = wordVector[maxPosIdx]
    print "The word that most impacts negative reviews is: " + str(negWord)
    print "The word that most impacts positive reviews is: " + str(posWord)
    
    

In [108]:
getInfluentialWords(weightVec,trainingVecs)

The word that most impacts negative reviews is: not
The word that most impacts positive reviews is: great


## 1.h N-gram model

In [109]:
def generateBiGramDict(reviews):
    """
    words is a list of words. The function creates a dictionary of unique words
    """
    biGramDict ={}
    for review in reviews:
        for i in range(len(review)-1):
            biGram= review[i] + " " + review[i+1]
            #biGram = str(review)
            if biGram not in  biGramDict.keys():
                biGramDict[biGram] = 0
    return biGramDict

def indexBiGram(d):
    """
    bag is the input. The function essentially assigns an index for each word in the bag.The index will be needed
    for counting the frequency of its word and create the correct feature vector.
    """
    idx = 0
    for biGram in d.keys():
        d[biGram] = idx
        idx+=1
    return d

def biGramFrequency(reviews,d):
    """
    iterates through a review and using the bag, it returns the feature vector of the review.
    """
    featureVectorList = []
    for review in reviews:
        featureVec = [0.0 for x in range(0,len(d.keys()))]
        for i in range(len(review)-1):
            biGram= review[i] + " " + review[i+1]
            if biGram in d.keys():
                #update the value in featureVec whose index is found in bag[word]
                featureVec[d[biGram]]+=1.0
            else:
                print "Error: " + biGram + " not in dictionary."
        featureVectorList.append(featureVec)
    return pd.DataFrame(featureVectorList,columns = d.keys())

In [110]:
#generate bag only from Training Data
biGramDict = generateBiGramDict(trainDF[0].tolist())
indexedBiGram = indexBiGram(biGramDict)

In [111]:
#generate all the feature vectors for Training and Test Data
biGramTrainingVecs = biGramFrequency(trainDF[0].tolist(),indexedBiGram)
biGramTestVecs = biGramFrequency(testDF[0].tolist(),indexedBiGram)

Error: use like not in dictionary.
Error: great tool not in dictionary.
Error: tool entertainment not in dictionary.
Error: entertainment communication not in dictionary.
Error: communication data not in dictionary.
Error: data management not in dictionary.
Error: management oh not in dictionary.
Error: oh sure not in dictionary.
Error: sure use not in dictionary.
Error: use activesync not in dictionary.
Error: activesync 4 not in dictionary.
Error: 4 2 not in dictionary.
Error: 2 optimal not in dictionary.
Error: optimal data not in dictionary.
Error: data synchronization not in dictionary.
Error: synchronization result not in dictionary.
Error: 2 case not in dictionary.
Error: case would not in dictionary.
Error: bought battery not in dictionary.
Error: battery coupon not in dictionary.
Error: coupon amazon not in dictionary.
Error: amazon happy not in dictionary.
Error: perfect ps3 not in dictionary.
Error: five star not in dictionary.
Error: star plus not in dictionary.
Error: plus

In [112]:
#Report feature vectors any two reviews
print biGramTrainingVecs.head(2)

   tension medical  give chill  clock docking  scratched protective  \
0                0           0              0                     0   
1                0           0              0                     0   

   problem others  expression feeling  except cole  played period  \
0               0                   0            0              0   
1               0                   0            0              0   

   not disappoint  friend enjoy     ...      lucy bell  phone mp3  slow take  \
0               0             0     ...              0          0          0   
1               0             0     ...              0          0          0   

   day week  must shakespear  touching character  barely lukewarm  headset pc  \
0         0                0                   0                0           0   
1         0                0                   0                0           0   

   later returned  prime time  
0               0           0  
1               0           0

### Run Logistic Regression on the normalized train and test vectors obtained from the Bi Gram model

In [113]:
normalizedBiGramTrainingVecs = normalize(biGramTrainingVecs)
normalizedBiGramTestVecs = normalize(biGramTestVecs)

In [114]:
biGramScore,biGramConfusionMatrix,biGramWeightVec = logisticRegression(normalizedBiGramTrainingVecs,trainDF[1],
                                                       normalizedBiGramTestVecs,testDF[1])

In [115]:
print biGramScore
print biGramConfusionMatrix

0.631666666667
[[268 189]
 [ 32 111]]


In [116]:
getInfluentialWords(biGramWeightVec,biGramTrainingVecs)

The word that most impacts negative reviews is: not good
The word that most impacts positive reviews is: work great


## 1.i PCA for bag of words model.

### PCA Algorithm

1. Center the data by subtracting the mean from it.
2. Calculate the covariance matrix.
3. Calculate the eigenvectors of the covariance matrix.

In [117]:
def center(x,mean):
    for row in x:
        row -= mean
    return x

In [118]:
trainingMean = normalizedTraining.mean()
testMean = normalizedTest.mean()

In [119]:

centeredTraining = center(normalizedTraining,trainingMean)
centeredTest = center(normalizedTest,testMean)

In [120]:
#compute SVD to get the eigenvectors and the eigenvalues
UTrain,DTrain, VTrain = scipy.linalg.svd(centeredTraining, full_matrices=False)
UTest, DTest, VTest = scipy.linalg.svd(centeredTest, full_matrices=False)

In [121]:
print UTrain.shape,DTrain.shape, VTrain.shape
print UTest.shape, DTest.shape, VTest.shape

(2400, 2400) (2400,) (2400, 4077)
(600, 600) (600,) (600, 4077)


In [122]:
def PCA(U,D,V,x):
    """
    takes the SVD decomposition matrices and returns a lower dimensional representation of the data
    """
    size = D.shape[0]
    newD = np.zeros((size,size))
    newDiagonal = np.concatenate((D[:x],[0] * (len(D) - x)))
    i  = 0
    j = 0
    while i < size and j < size:
        newD[i][j] = newDiagonal[i]
        i+=1
        j+=1
    return pd.DataFrame(np.dot(U, np.dot(newD,V)))


In [123]:
train10 = PCA(UTrain,DTrain,VTrain,10)
train50 = PCA(UTrain,DTrain,VTrain,50)
train100 = PCA(UTrain,DTrain,VTrain,100)

test10 = PCA(UTest,DTest,VTest,10)
test50 = PCA(UTest,DTest,VTest,50)
test100 = PCA(UTest,DTest,VTest,10)

In [124]:
normalizedTrain10 = normalize(train10)
normalizedTest10 = normalize(test10)

normalizedTrain50 = normalize(train50)
normalizedTest50 = normalize(test50)

normalizedTrain100 = normalize(train100)
normalizedTest100 = normalize(test100)

In [125]:
clusterNum = 2
kMeansCluster10,kMeansClusterMean10= KMeans(normalizedTrain10,clusterNum)
kMeansCluster50,kMeansClusterMean50= KMeans(normalizedTrain50,clusterNum)
kMeansCluster100,kMeansClusterMean100= KMeans(normalizedTrain100,clusterNum)

iteration: 1
iteration: 2
iteration: 3
iteration: 1
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 9
iteration: 1
iteration: 2


In [126]:
displayResults(trainDF,kMeansCluster10)


Number of points in cluster 0 :211
Number of 1s in cluster 0 :99
percent of 1s in  cluster 0 :0.469194312796
Number of 0s in cluster 0 :112
percent of 0s in cluster 0 :0.530805687204

Number of points in cluster 1 :2189
Number of 1s in cluster 1 :1101
percent of 1s in  cluster 1 :0.502969392417
Number of 0s in cluster 1 :1088
percent of 0s in cluster 1 :0.497030607583


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [127]:
displayResults(trainDF,kMeansCluster50)


Number of points in cluster 0 :2267
Number of 1s in cluster 0 :1073
percent of 1s in  cluster 0 :0.473312748125
Number of 0s in cluster 0 :1194
percent of 0s in cluster 0 :0.526687251875

Number of points in cluster 1 :133
Number of 1s in cluster 1 :127
percent of 1s in  cluster 1 :0.954887218045
Number of 0s in cluster 1 :6
percent of 0s in cluster 1 :0.0451127819549


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [128]:
displayResults(trainDF,kMeansCluster100)


Number of points in cluster 0 :2272
Number of 1s in cluster 0 :1140
percent of 1s in  cluster 0 :0.50176056338
Number of 0s in cluster 0 :1132
percent of 0s in cluster 0 :0.49823943662

Number of points in cluster 1 :128
Number of 1s in cluster 1 :60
percent of 1s in  cluster 1 :0.46875
Number of 0s in cluster 1 :68
percent of 0s in cluster 1 :0.53125


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [129]:
score10,confusionMatrix10,weightVec10 = logisticRegression(normalizedTrain10,trainDF[1],normalizedTest10,testDF[1])
score50,confusionMatrix50,weightVec50 = logisticRegression(normalizedTrain50,trainDF[1],normalizedTest50,testDF[1])
score100,confusionMatrix100,weightVec100 = logisticRegression(normalizedTrain100,trainDF[1],normalizedTest100,testDF[1])

In [130]:
print score10,score50,score100

0.628333333333 0.656666666667 0.613333333333


In [131]:
print confusionMatrix10
print confusionMatrix50
print confusionMatrix100

[[193 116]
 [107 184]]
[[221 127]
 [ 79 173]]
[[183 115]
 [117 185]]
