In [53]:
import pandas as pd
# To download the original dataset instead of using the local copy, uncomment:
#df = pd.read_csv('https://raw.githubusercontent.com/rasbt/pattern_classification/master/data/50k_imdb_movie_reviews.csv')
# Otherwise load the local CSV file (path is relative to the working directory).
df = pd.read_csv('shuffled_movie_data.csv')
# Show the last five rows as a quick check of the loaded columns.
df.tail()

Unnamed: 0,review,sentiment
49995,"OK, lets start with the best. the building. al...",0
49996,The British 'heritage film' industry is out of...,0
49997,I don't even know where to begin on this one. ...,0
49998,Richard Tyler is a little boy who is scared of...,0
49999,I waited long to watch this movie. Also becaus...,1


# Function definition to process text:

In [62]:
# Return a lower-case processed text
def processtext(texto):
    """Normalize a raw review before feature extraction.

    The text is lower-cased. Then the characters . ; : ' ? , " ( ) [ ] and
    newlines are deleted, and finally every doubled HTML line break
    (``<br /><br />``), hyphen and slash is replaced by one space.
    Exclamation marks are kept. A lone ``<br />`` is not removed as a unit;
    only its slash is replaced by a space.

    Parameters
    ----------
    texto : str
        Raw review text.

    Returns
    -------
    str
        The normalized text.
    """
    import re
    # Raw string literals: the previous plain literals depended on invalid
    # escape sequences (backslash-dot, backslash-s, ...), which emit a
    # DeprecationWarning/SyntaxWarning on current Python versions.
    # The set of matched characters is unchanged.
    REPLACE_NO_SPACE = re.compile(r"[.;:'?,\"()\[\]\n]")
    # The <br> alternative comes first so a doubled break is consumed as a
    # whole before its slashes could be matched individually.
    REPLACE_WITH_SPACE = re.compile(r"<br\s*/><br\s*/>|[-/]")
    texto = REPLACE_NO_SPACE.sub('', texto.lower())
    texto = REPLACE_WITH_SPACE.sub(' ', texto)
    return texto

## Get the number of pronouns in a text:

In [48]:
# Get the number of 1st and 2nd pronouns:
def numberpronouns(texto):
    """Count first- and second-person pronoun tokens in ``texto``.

    The whole-word tokens counted are: i, we, you, me, myself, yourself and
    the bare token ll. Matching is case-sensitive, so only lower-case
    occurrences are found.

    Parameters
    ----------
    texto : str
        Text to scan.

    Returns
    -------
    float
        Total number of matches over all tokens.
    """
    import re
    word_patterns = (
        r'\bi\b', r'\bwe\b', r'\byou\b', r'\bme\b',
        r'\bmyself\b', r'\byourself\b', r'\bll\b',
    )
    total = sum(len(re.findall(pattern, texto)) for pattern in word_patterns)
    return float(total)

## Check whether the text contains "no", "not" or "neither"

In [47]:
def isNoThere(texto):
    """Flag whether ``texto`` contains a negation word.

    The whole words looked for are "no", "not" and "neither"
    (case-sensitive).

    Parameters
    ----------
    texto : str
        Text to scan.

    Returns
    -------
    float
        1.0 if at least one of the words occurs, otherwise 0.0.
    """
    import re
    for negation in (r'\bno\b', r'\bnot\b', r'\bneither\b'):
        # One hit is enough; no need to collect every occurrence.
        if re.search(negation, texto) is not None:
            return 1.0
    return 0.0

## Is ( ! ) character contained in the text?

In [46]:
def isExclamationThere(texto):
    """Flag whether ``texto`` contains at least one exclamation mark.

    Parameters
    ----------
    texto : str
        Text to scan.

    Returns
    -------
    float
        1.0 if a "!" character is present, otherwise 0.0.
    """
    import re
    first_hit = re.search(r'!', texto)
    return 0.0 if first_hit is None else 1.0

## Get the logarithm of number of words

In [6]:
def getLnNumberWords(texto_dividido):
    """Return the natural logarithm of the number of tokens.

    Parameters
    ----------
    texto_dividido : sequence
        Tokens of a text; only its length is used.

    Returns
    -------
    numpy.float64
        ``np.log(len(texto_dividido))``.
    """
    import numpy as np
    return np.log(len(texto_dividido))

## Get Number of Positive and negative words

In [45]:
def getPositiveNegativeCountWords(texto_dividido, posneg_dictionary):
    """Count how many tokens are labelled positive and how many negative.

    Parameters
    ----------
    texto_dividido : iterable
        Tokens of the text.
    posneg_dictionary : mapping
        Maps a word to 1 (positive) or 0 (negative). Tokens that are not
        keys, or whose value is neither 1 nor 0, are not counted.

    Returns
    -------
    tuple of (float, float)
        ``(positive_count, negative_count)``.
    """
    n_positive = 0
    n_negative = 0
    for token in texto_dividido:
        try:
            polarity = posneg_dictionary[token]
        except KeyError:
            # Token is not in the lexicon.
            continue
        if polarity == 1:
            n_positive += 1
        elif polarity == 0:
            n_negative += 1
    return (float(n_positive), float(n_negative))

## Load the full dictionary:

In [8]:
def _add_words_from_file(path, start_marker, label, target):
    """Add the single-word lines of one lexicon file to ``target`` under ``label``."""
    with open(path, 'r') as fichero:
        # Skip the header: consume lines up to and including the first one
        # that contains ``start_marker``.
        # NOTE(review): the marker line itself is therefore never added to the
        # dictionary - confirm this is intended.
        for lin in fichero:
            if start_marker in lin:
                break
        # The same file iterator continues right after the marker line.
        for lin in fichero:
            word = lin.rstrip('\n')
            # Blank lines used to be stored under the empty-string key, which
            # made empty tokens count as sentiment words; skip them.
            # Entries containing a space are ignored, as before.
            if word and ' ' not in word:
                target[word] = label


def chargeDictionariPosNeg():
    """Build the word -> polarity dictionary from the two lexicon files.

    Reads ``goodbad/positive-words.txt`` (label 1) and then
    ``goodbad/negative-words.txt`` (label 0), both relative to the working
    directory. In each file everything up to and including the first line
    containing the start marker ('a+' and '2-faces' respectively) is
    skipped. A word present in both files keeps the negative label because
    the negative file is read last.

    Returns
    -------
    dict
        Maps each word to 1 (positive) or 0 (negative).

    Raises
    ------
    OSError
        If either lexicon file cannot be opened.
    """
    posneg_dictionary = {}
    _add_words_from_file('goodbad/positive-words.txt', 'a+', 1, posneg_dictionary)
    _add_words_from_file('goodbad/negative-words.txt', '2-faces', 0, posneg_dictionary)
    return posneg_dictionary

# Extract full features

In [10]:
dictionary = chargeDictionariPosNeg()

In [11]:
def extract_features(texto, dictionary):
    """Compute the six numeric features of one review.

    The text is normalized with ``processtext`` and split on single spaces.
    Consecutive spaces therefore produce empty-string tokens, which are part
    of the token list passed to the counting helpers.

    Parameters
    ----------
    texto : str
        Raw review text.
    dictionary : mapping
        Word -> polarity mapping handed to ``getPositiveNegativeCountWords``.

    Returns
    -------
    dict
        Keys 'x1'..'x6': positive-word count, negative-word count, negation
        flag, pronoun count, exclamation flag and log of the token count.
    """
    cleaned = processtext(texto)
    tokens = cleaned.split(' ')
    positives, negatives = getPositiveNegativeCountWords(tokens, dictionary)
    return {
        'x1': positives,
        'x2': negatives,
        'x3': isNoThere(cleaned),
        'x4': numberpronouns(cleaned),
        'x5': isExclamationThere(cleaned),
        'x6': getLnNumberWords(tokens),
    }

In [50]:
texto = """In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70's, they discover the criminal and a net of power and money to cover the murder.<br /><br />"Murder in Greenwich" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich family used their influence to cover the murder for more than twenty years. However, a snoopy detective and convicted perjurer in disgrace was able to disclose how the hideous crime was committed. The screenplay shows the investigation of Mark and the last days of Martha in parallel, but there is a lack of the emotion in the dramatization. My vote is seven.<br /><br />Title (Brazil): Not Available
"""

In [51]:
M = extract_features(texto, dictionary)

In [52]:
M

{'x1': 8.0,
 'x2': 14.0,
 'x3': 1.0,
 'x4': 0.0,
 'x5': 0.0,
 'x6': 5.472270673671475}

In [37]:
#import pandas as pd
#df = pd.DataFrame(columns=['x1','x2','x3','x4','x5','x6'])
#df.index.name = 'Id'
#df = df.append(M, ignore_index = True)
#df

In [36]:
#for index, row in df.iterrows():
#    print(row['x1'],row['x2'])

In [66]:
# Build the feature table in a single pass. The rows are collected in a list
# and the DataFrame is created once: growing a frame with DataFrame.append
# inside the loop copies the whole frame on every iteration, and that method
# no longer exists in pandas 2.x.
records = []
for texto, sentiment in zip(df['review'], df['sentiment']):
    features_text = extract_features(texto, dictionary)
    # Keep the label as a float, matching the 0.0/1.0 values the table showed before.
    features_text['sentiment'] = float(sentiment)
    records.append(features_text)

data = pd.DataFrame(records, columns=['x1','x2','x3','x4','x5','x6','sentiment'])
data.index.name = 'Id'

In [39]:
# NOTE(review): scratch variable; no other cell visible in this notebook references it.
aaa = {'b' : 1}

In [67]:
data

Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,sentiment
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,8.0,14.0,1.0,0.0,0.0,5.472271,1.0
1,13.0,10.0,0.0,9.0,1.0,5.442418,0.0
2,14.0,21.0,1.0,6.0,1.0,5.680173,0.0
3,4.0,0.0,0.0,7.0,0.0,4.394449,1.0
4,4.0,2.0,0.0,3.0,0.0,4.762174,0.0
5,9.0,3.0,1.0,6.0,0.0,5.056246,1.0
6,15.0,4.0,1.0,5.0,0.0,5.666427,1.0
7,4.0,4.0,1.0,1.0,1.0,4.812184,1.0
8,6.0,3.0,0.0,3.0,1.0,4.762174,1.0
9,9.0,4.0,0.0,0.0,1.0,4.852030,1.0


In [68]:
data.to_csv('6features.csv')

# Validación del modelo logístico

In [69]:
Y = data['sentiment'].copy()

In [70]:
# Feature matrix: every column except the target.
# drop() without inplace=True returns a new frame, so X is a real DataFrame
# (the in-place form returns None), `data` keeps its 'sentiment' column, and
# this cell can be re-run without raising a KeyError.
X = data.drop(['sentiment'], axis=1)

In [74]:
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection (available since 0.18).
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 0)

In [75]:
from sklearn import linear_model

In [76]:
# Fit scikit-learn's logistic regression, constructed with no arguments, on the training split.
lm = linear_model.LogisticRegression()
lm.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [77]:
probs = lm.predict_proba(X_test)

In [81]:
probs

array([[0.16344222, 0.83655778],
       [0.74322754, 0.25677246],
       [0.14207552, 0.85792448],
       ...,
       [0.37129934, 0.62870066],
       [0.08725982, 0.91274018],
       [0.6544095 , 0.3455905 ]])

In [82]:
prediction = lm.predict(X_test)

In [84]:
prediction

array([1., 0., 1., ..., 1., 1., 0.])

In [85]:
from IPython.display import display, Math, Latex

In [86]:
display(Math(r'Y_p = \begin{cases}0& si\ p\leq0.5\\1&si\ p >0.5\end{cases}'))

<IPython.core.display.Math object>

In [89]:
display(Math(r'\varepsilon\in (0,1), Y_p = \begin{cases}0& si\ p\leq\varepsilon \\1&si\ p >\varepsilon \end{cases}'))

<IPython.core.display.Math object>

In [88]:
probs

array([[0.16344222, 0.83655778],
       [0.74322754, 0.25677246],
       [0.14207552, 0.85792448],
       ...,
       [0.37129934, 0.62870066],
       [0.08725982, 0.91274018],
       [0.6544095 , 0.3455905 ]])

In [98]:
import numpy as np
# Custom decision threshold applied to the second column of `probs`.
threshold = 0.4
prob = probs[:, 1]
prob_df = pd.DataFrame(prob)
# 1 where the probability exceeds the threshold, 0 otherwise.
prob_df['prediction'] = (prob_df[0] > threshold).astype(int)
prob_df.head()

Unnamed: 0,0,prediction
0,0.836558,1
1,0.256772,0
2,0.857924,1
3,0.848039,1
4,0.213856,0


In [100]:
pd.crosstab(prob_df.prediction, columns='count')

col_0,count
prediction,Unnamed: 1_level_1
0,5567
1,9433


In [101]:
from sklearn import metrics

In [102]:
metrics.accuracy_score(Y_test, prediction)

0.7300666666666666

# Testing my own implementation

In [170]:
import numpy as np
from numpy.linalg import inv
class MyLogisticRegression():
    """Binary logistic regression fitted with Newton-Raphson updates.

    The model keeps a single column vector of coefficients whose last entry
    is the intercept (a column of ones is appended to the inputs in ``fit``).
    """

    def __init__(self, coef = None):
        # Coefficient vector with the intercept last; None until fit() has run
        # or a vector is supplied here.
        self.coef = coef

    def logistic_prob(self, X, B):
        """Return the logistic probability of every row of ``X``.

        Parameters
        ----------
        X : array-like, 2-D
            One row per observation.
        B : array-like
            Coefficients, one per column of ``X``.

        Returns
        -------
        list
            One entry per row holding ``1 / (1 + exp(-(row . B)))``.
        """
        X = np.asarray(X, dtype=float)
        B = np.asarray(B, dtype=float)
        with np.errstate(divide='ignore', invalid='ignore'):
            probabilities = 1 / (1 + np.exp(-np.matmul(X, B)))
        # A list is returned to keep the shape callers already rely on.
        return list(probabilities)

    def getW(self, P):
        """Return the diagonal matrix with entries ``P[i] * (1 - P[i])``.

        The result is a dense n x n array, so it is only practical for small
        inputs; ``fit`` applies the same weights without materialising it.
        """
        p = np.asarray(P, dtype=float).ravel()
        return np.diag(p * (1 - p))

    def fit(self, data, labels, err_allowed, max_iter = 100):
        """Estimate the coefficients by Newton-Raphson iteration.

        Parameters
        ----------
        data : pandas.DataFrame
            Feature columns (read through ``.values``).
        labels : pandas.Series
            Target values, one per row of ``data`` (read through ``.values``).
        err_allowed : float
            Iteration stops once the squared norm of the last update is not
            greater than this value.
        max_iter : int, optional
            Upper bound on the number of updates, so that a fit that does not
            reach ``err_allowed`` cannot loop forever.

        Raises
        ------
        numpy.linalg.LinAlgError
            If the weighted normal-equation matrix is singular.

        The squared update norm of each iteration and the final coefficients
        are printed; the coefficients are stored in ``self.coef``.
        """
        X = np.asarray(data.values, dtype=float)
        Y = np.asarray(labels.values, dtype=float).reshape(-1, 1)
        rows = X.shape[0]
        # Bias input, always 1, appended as the last column.
        X_b = np.hstack([X, np.ones((rows, 1))])
        cols = X_b.shape[1]
        # Start from a zero column vector.
        B = np.zeros((cols, 1))
        # Initial error, chosen large so the loop normally runs at least once.
        current_error = 1000
        iterations = 0
        while current_error > err_allowed and iterations < max_iter:
            p = self.sigmoid(np.matmul(X_b, B))
            # Weights p * (1 - p) are applied row-wise; this equals X^T W X
            # without building the n x n diagonal matrix W.
            weights = p * (1 - p)
            hessian = np.matmul(X_b.T, X_b * weights)
            gradient = np.matmul(X_b.T, Y - p)
            # Solving the linear system avoids forming an explicit inverse.
            dB = np.linalg.solve(hessian, gradient)
            # Get the new Beta value
            B = B + dB
            current_error = np.sum(dB*dB)
            print('Current Error>', current_error)
            self.coef = B
            iterations += 1
        print('B>', B)

    def dotproduct(self, a, b):
        """Return the sum of the element-wise products of ``a`` and ``b``."""
        return sum(x * y for x, y in zip(a, b))

    def sigmoid(self, val):
        """Return ``1 / (1 + exp(-val))``."""
        return 1/(1 + np.exp(-val))

    def predict(self, X_test, threshold):
        """Classify every row of ``X_test``.

        Parameters
        ----------
        X_test : pandas.DataFrame
            Feature columns in the same order used for fitting.
        threshold : float
            A row is labelled 1 when its probability is >= ``threshold``.

        Returns
        -------
        numpy.ndarray or None
            Array of shape (n_rows, 1) with 0.0/1.0 labels. If the model has
            no coefficients yet, a message is printed and None is returned.
        """
        # np.shape(None) is (), so indexing it raised IndexError before the
        # message could be printed; test for the missing vector directly.
        if self.coef is None or np.size(self.coef) == 0:
            print('Error: Entrenar el modelo')
            return None
        X = np.asarray(X_test.values, dtype=float)
        coef = np.asarray(self.coef, dtype=float).reshape(-1, 1)
        W = coef[:-1]
        b = coef[-1]
        prob = self.sigmoid(np.matmul(X, W) + b)
        return (prob >= threshold).astype(float).reshape(-1, 1)

In [171]:
mymodel = MyLogisticRegression()

In [172]:
mymodel.fit(X_train, Y_train, 0.01)

Current Error> 0.2815552804489796
Current Error> 0.01561520605966815
Current Error> 0.0011599281053884586
B> [[ 0.20385232]
 [-0.17450669]
 [-0.48942494]
 [-0.03599915]
 [-0.02781703]
 [ 0.09708123]
 [-0.36845839]]


In [176]:
pr = mymodel.predict(X_test, 0.5)

In [177]:
pr

array([[1.],
       [0.],
       [1.],
       ...,
       [1.],
       [1.],
       [0.]])

# Get accuracy of my model

In [178]:
metrics.accuracy_score(Y_test, pr)

0.73