In [2]:
import numpy as np;
import pandas as pd;
from scipy.optimize import fmin_bfgs;
import traceback;

class GenTransform:
    
    def __init__(self):
        return;
    
    @staticmethod
    def generateBooleanColumns(df, columns):
        for column in columns:
                df[column] = -1 * df[column].isnull() + 1
        return df;
    
    @staticmethod
    def generateValueColumns(df, columns):
        for column in columns:
            df = pd.concat([df.drop(column, axis=1), df[column].str.get_dummies()], axis = 1);
        return df;
    
    @staticmethod
    def removeNonFeatures(df, columns):
        for column in columns:
            df = df.drop(column, axis = 1);
        return df;
    
    @staticmethod
    def fillNaWithMean(df):
        return df.fillna(df.mean());
    
    @staticmethod
    def dataTransform(df, listRowstoColumns, listRowsToBoolean, listNonFeatures):
        df1 = GenTransform.generateBooleanColumns(df, listRowsToBoolean);
        df2 = GenTransform.removeNonFeatures(df, listNonFeatures);
        df3 = GenTransform.generateValueColumns(df2, listRowstoColumns);
        df4 = GenTransform.fillNaWithMean(df3);
        return(df4);


In [8]:
class LogisticRegression:
    
    def __init__(self, X, y, lam):
        self.X = X;
#         print("in constructor, length of received arguement: ", y.shape)
        self.y = y;
        self.lam = lam;
        self.m = X.shape[0];
        self.n = X.shape[1];
        self.y = np.reshape(self.y, (self.m,1)); # need to check why this is happening
        self.X = LogisticRegression.addIntercept(self.X); # to be checked for documentation for how class name used in constructor
        self.theta = self.initializeTheta();
#         print("constructor",self.y.shape);
        
    def initializeTheta(self):
        return np.zeros((self.n + 1, 1));
    
    @staticmethod
    def addIntercept(X):
        m = X.shape[0];
        X = np.hstack((np.ones((m,1)), X));
        return X;
        
    @staticmethod
    def sigmoid(z):
        return 1 / (1 + np.exp(-1 * z));

    def buildModel(self):
        
        fminOutput = fmin_bfgs(
                            self.costFunction,
                            self.theta,
                            self.gradFunction,
                            disp=True,
                            maxiter=400,
                            full_output = True,
                            retall=True
                        );
        opTheta = fminOutput[0];
        opTheta = opTheta.reshape((self.n + 1, 1));
        return opTheta, fminOutput
        
    
        
    def costFunction(self, theta):
        
        theta = np.reshape(theta,(self.n + 1, 1));
        
        z = np.dot(self.X, theta);
        h = LogisticRegression.sigmoid(z);
#         print("cost function before",self.y.shape);
        J = np.add(
                np.multiply(
                    (1/self.m) 
                    , np.sum( 
                            np.subtract(
                                np.multiply(
                                    np.multiply(-1, self.y),
                                    np.log(h)
                                )
                                , np.multiply(
                                    np.subtract(1, self.y), 
                                    np.log(np.subtract(1, h))
                                )
                            )
                        )
                   )  
                , np.multiply(
                    (self.lam/self.m) 
                    , np.sum(
                        np.square(
                            np.vstack(
                                [0, theta[1:]]
                            )
                        )
                    ) 
                )
        );
#         print("cost function after",self.y.shape);
        return J;
    
    def gradFunction(self, theta):
        
#         print("grad function before",self.y.shape);
        theta = np.reshape(theta,(self.n + 1, 1));
        
#         print("grad function after reshaping",self.y.shape);
        z = np.dot(self.X, theta);
        h = LogisticRegression.sigmoid(z);
        
        grad = ( 
                    np.multiply(
                        (1/self.m) 
                        , (
                           np.dot(
                               np.transpose(self.X)
                                ,np.subtract(h, self.y)
                           )
                        )
                    )
                ) + \
                ( 
                    np.multiply(
                        (self.lam/self.m)
                        , np.vstack(
                            [0, theta[1:]]
                        )
                    )
                );
#         print("grad function after function formula",self.y.shape);
        grad = np.asarray(grad).reshape((self.n + 1,));
#         print("grad function after function formula and reshaping",self.y.shape);
        return grad;
    
    
# sampleX = np.matrix('1 2; 3 4; 5 6; 7 8');
# sampleY = np.matrix('1;0;0;1')
# lgTest = LogisticRegression(sampleX, sampleY, 1);
# try:
#     opTheta, fminOutput = lgTest.buildModel();
# except(e):
#     print(e);
# LogisticRegression.sigmoid(LogisticRegression.addIntercept(sampleX) * opTheta)


listRowsToColumns=["Sex", "Embarked"];
listRowsToBoolean=["Cabin"];
listNonFeatures = ["Name", "Ticket"];


# trainingDf= pd.read_csv("C:/Dinesh/kaggle/titanic/train.csv", index_col=0, parse_dates=True, nrows = 20);
trainingDf= pd.read_csv("D:/datasets/titanic-kaggle/train.csv", index_col=0, parse_dates=True);
trainingDf = GenTransform.dataTransform(trainingDf, listRowsToColumns, listRowsToBoolean, listNonFeatures);   


trainingX = np.matrix(trainingDf.ix[:, trainingDf.columns != 'Survived'].values);
trainingY = np.transpose(np.matrix(trainingDf["Survived"].values));
print("before constructor",trainingY.shape, len(trainingY));
# try:
maxAcc = 0;
for lam in range(0,120,3):
    lrIns = LogisticRegression(trainingX, trainingY, float(lam)/10.0);
    theta, fminOutput = lrIns.buildModel();
    predictedY = np.round(LogisticRegression.sigmoid(np.dot(LogisticRegression.addIntercept(trainingX),theta)));
    currAcc = np.count_nonzero(trainingY == predictedY)
    if(currAcc>maxAcc):
        opLam = lam;
        opTheta = theta;
        maxAcc = currAcc;
print(maxAcc, float(opLam)/10.0);

testDf= pd.read_csv("D:/datasets/titanic-kaggle/test.csv", index_col=0, parse_dates=True);
testDf = GenTransform.dataTransform(testDf, listRowsToColumns, listRowsToBoolean, listNonFeatures); 
testX = np.matrix(testDf.ix[:, testDf.columns != 'Survived'].values);

predictedY = np.rint(LogisticRegression.sigmoid(np.dot(LogisticRegression.addIntercept(testX),opTheta)));
# currAcc = np.count_nonzero(trainingY == predictedY)
np.savetxt('D:/datasets/titanic-kaggle/predictedY.csv', predictedY.astype(int), fmt='%i', delimiter=",");
print("done with writing output");
    
# except:
#     traceback.print_exc();
#     pass;



before constructor (891, 1) 891
Optimization terminated successfully.
         Current function value: 0.436913
         Iterations: 80
         Function evaluations: 87
         Gradient evaluations: 87




         Current function value: 0.439723
         Iterations: 37
         Function evaluations: 148
         Gradient evaluations: 136
         Current function value: 0.441963
         Iterations: 37
         Function evaluations: 132
         Gradient evaluations: 120
         Current function value: 0.444051
         Iterations: 38
         Function evaluations: 88
         Gradient evaluations: 77




         Current function value: 0.446009
         Iterations: 37
         Function evaluations: 102
         Gradient evaluations: 91
         Current function value: 0.447872
         Iterations: 37
         Function evaluations: 142
         Gradient evaluations: 130




         Current function value: 0.449613
         Iterations: 37
         Function evaluations: 141
         Gradient evaluations: 129
         Current function value: 0.451280
         Iterations: 37
         Function evaluations: 133
         Gradient evaluations: 121




         Current function value: 0.452852
         Iterations: 38
         Function evaluations: 108
         Gradient evaluations: 98
         Current function value: 0.454325
         Iterations: 38
         Function evaluations: 145
         Gradient evaluations: 133




         Current function value: 0.452922
         Iterations: 59
         Function evaluations: 142
         Gradient evaluations: 130
         Current function value: 0.457102
         Iterations: 38
         Function evaluations: 133
         Gradient evaluations: 121




         Current function value: 0.458351
         Iterations: 40
         Function evaluations: 146
         Gradient evaluations: 134
         Current function value: 0.459556
         Iterations: 40
         Function evaluations: 124
         Gradient evaluations: 112




         Current function value: 0.458430
         Iterations: 57
         Function evaluations: 151
         Gradient evaluations: 139
         Current function value: 0.459762
         Iterations: 56
         Function evaluations: 154
         Gradient evaluations: 142




         Current function value: 0.462990
         Iterations: 41
         Function evaluations: 145
         Gradient evaluations: 133
         Current function value: 0.464236
         Iterations: 39
         Function evaluations: 135
         Gradient evaluations: 123




         Current function value: 0.463602
         Iterations: 55
         Function evaluations: 175
         Gradient evaluations: 163
         Current function value: 0.466333
         Iterations: 39
         Function evaluations: 99
         Gradient evaluations: 89




         Current function value: 0.466069
         Iterations: 56
         Function evaluations: 175
         Gradient evaluations: 163
         Current function value: 0.468398
         Iterations: 40
         Function evaluations: 154
         Gradient evaluations: 142
         Current function value: 0.469395
         Iterations: 40
         Function evaluations: 107
         Gradient evaluations: 96




         Current function value: 0.470303
         Iterations: 43
         Function evaluations: 133
         Gradient evaluations: 121
         Current function value: 0.471304
         Iterations: 43
         Function evaluations: 138
         Gradient evaluations: 126




         Current function value: 0.472413
         Iterations: 43
         Function evaluations: 104
         Gradient evaluations: 92
         Current function value: 0.473499
         Iterations: 40
         Function evaluations: 160
         Gradient evaluations: 148




         Current function value: 0.474386
         Iterations: 43
         Function evaluations: 103
         Gradient evaluations: 91
         Current function value: 0.475353
         Iterations: 42
         Function evaluations: 145
         Gradient evaluations: 133
         Current function value: 0.476356
         Iterations: 42
         Function evaluations: 157
         Gradient evaluations: 145




         Current function value: 0.486252
         Iterations: 14
         Function evaluations: 75
         Gradient evaluations: 63
         Current function value: 0.478193
         Iterations: 47
         Function evaluations: 173
         Gradient evaluations: 161




         Current function value: 0.479162
         Iterations: 47
         Function evaluations: 129
         Gradient evaluations: 118
         Current function value: 0.489034
         Iterations: 16
         Function evaluations: 68
         Gradient evaluations: 56
         Current function value: 0.481111
         Iterations: 49
         Function evaluations: 168
         Gradient evaluations: 156




         Current function value: 0.482078
         Iterations: 48
         Function evaluations: 175
         Gradient evaluations: 163
         Current function value: 0.483016
         Iterations: 47
         Function evaluations: 153
         Gradient evaluations: 141
         Current function value: 0.486600
         Iterations: 34
         Function evaluations: 119
         Gradient evaluations: 108




         Current function value: 0.484939
         Iterations: 50
         Function evaluations: 187
         Gradient evaluations: 175
         Current function value: 0.494627
         Iterations: 16
         Function evaluations: 83
         Gradient evaluations: 71
718 9.9
done with writing output




In [227]:
import re
reviewsDf= pd.read_csv("D:/Codes/dhana_sister_fyp/review-sentiment-analysis/sentiment analyzer/reviews_sister.csv", index_col=0, parse_dates=True, nrows= 10);

In [228]:
wordFreq = dict();
stopWords = ["a","about","above","after","again","against","all","am","an","and","any","are","as","at",
"be","because","been","before","being","below","between","both","but","by","could","did",
"do","does","doing","down","during","each","few","for","from","further","had","has","have",
"having","he","he'd","he'll","he's","her","here","here's","hers","herself","him","himself",
"his","how","how's","i","i'd","i'll","i'm","i've","if","in","into","is","it","it's","its",
"itself","let's","me","more","most","my","myself","of","off","on","once","only","or","other",
"ought","our","ours","ourselves","out","over","own","same","she","she'd","she'll","she's",
"should","so","some","such","than","that","that's","the","their","theirs","them","themselves",
"then","there","there's","these","they","they'd","they'll","they're","they've","this","those",
"through","to","too","under","until","up","very","was","we","we'd","we'll","we're","we've",
"were","what","what's","when","when's","where","where's","which","while","who","who's","whom",
"why","why's","with","would","you","you'd","you'll","you're","you've","your","yours","yourself",
"yourselves"];
reviews = reviewsDf.values[:,1];
words = [];
for review in reviews:
    review = re.sub('[^0-9a-zA-Z ]+', '', review);
    reviewWords = review.lower().strip().split(" ");
    for word in reviewWords:
        if(not word in stopWords):
            if(not word in wordFreq):
                wordFreq[word] = 1;
                words.append(word);
                
            else:
                wordFreq[word] += 1;
wordFreq.pop('', None);
if '' in words: words.remove('');
wordFreq = sorted(wordFreq.items(), key=lambda x:x[1], reverse = True);
count = 0;
for key in wordFreq:
    if key in words: words.remove(key);
    count = count + 1;
    if(count==100):
        break;

In [229]:
for word in words:
    reviewsDf["is_" + word] = reviewsDf["Review"].str.contains(word) * 1
reviewsXColumns = [col for col in reviewsDf.columns if col not in ['positive', 'negative', 'neutral', 'ProductName', 'Review']]
reviewsX = np.matrix(reviewsDf[reviewsXColumns]);
sentiments = ["positive", "negative", "neutral"];
predictionsSentiment = np.ones((len(reviewsDf.index),1)) * -1;
i =0;
for sentiment in sentiments:
    reviewsy = np.transpose(np.matrix(reviewsDf[sentiment]));
    reviewLrIns = LogisticRegression(reviewsX, reviewsy, 0.5);
    theta, fminOutput = reviewLrIns.buildModel();
    predictedY = np.round(LogisticRegression.sigmoid(np.dot(LogisticRegression.addIntercept(reviewsX),theta)));
    predictionsSentiment = np.hstack([predictionsSentiment, predictedY]);
    i = i+1;
sentimentIndices = np.subtract(np.argmax(predictionsSentiment, axis = 1), 1).flatten().tolist()[0]
predictedSentiments = [];
for i in sentimentIndices:
    predictedSentiments.append(sentiments[i]);
finalPredictedSentiments = pd.DataFrame(np.transpose(np.matrix(predictedSentiments)));
finalPredictedSentiments.to_csv("D:/Codes/dhana_sister_fyp/review-sentiment-analysis/sentiment analyzer/finalPredictedSentiments.csv",
                                sep=',');



KeyError: 'positive'