In [1]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
import time
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from torchvision import datasets, transforms, models
import re
import string
from ast import literal_eval

In [2]:
#nltk.download()

In [3]:
# Read csv file into pandas dataframe and name columns
data_dir = 'data/training.csv'
cols = ['sentiment', 'tweet']

train = pd.read_csv(data_dir, encoding = "ISO-8859-1", header=None, usecols=[0,5], names=cols)
train.head()

Unnamed: 0,sentiment,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [4]:
#train = train[0:10000]
train.shape

(1600000, 2)

In [5]:
#Observe Data
print((train['sentiment'] == 0).sum())
print((train['sentiment'] == 2).sum())
print((train['sentiment'] == 4).sum())
print((train['sentiment'].unique()))

800000
0
800000
[0 4]


In [6]:
"""
Scale the Data so 1 represents positive sentiment and 0 negative sentiment
"""
train['sentiment'] = train['sentiment']/4

In [7]:
print(train.head())
print((train['sentiment'] == 0.0).sum())
print((train['sentiment'] == 1.0).sum())

   sentiment                                              tweet
0        0.0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1        0.0  is upset that he can't update his Facebook by ...
2        0.0  @Kenichan I dived many times for the ball. Man...
3        0.0    my whole body feels itchy and like its on fire 
4        0.0  @nationwideclass no, it's not behaving at all....
800000
800000


In [8]:
"""
Process the tweets to remove not very significant words/symbols
"""
stopWords = set(stopwords.words('english'))

def cleanText(tweet):
    # remove punctuation marks
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    
    # remove "@" symbol
    remove_at = re.sub(r'@[A-Za-z0-9]+',"", tweet)
    # remove url's
    remove_url = re.sub('https?://[A-Za-z0-9./]+',"", remove_at)
    
    #remove stopwords
    text = word_tokenize(remove_url)
    clean = [word for word in text if word not in stopwords.words('english')]
    
    #lower case all letter
    clean = [word.lower() for word in clean]

    return clean

In [9]:
"""
Tokenize the data in batches to see how fast the cleaning takes
"""
batch_size = 10000
iters = train.shape[0]//batch_size
i = 0


for i in range(iters):
    batch = train[i*batch_size:i*batch_size+batch_size]
    batch['tweet'] = batch['tweet'].apply(cleanText)
    t = pd.concat([train, batch], 1)
    print("Loop: {}".format(i + 1))
    
t = t.iloc[:, :-2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


Loop: 1
Loop: 2
Loop: 3
Loop: 4
Loop: 5
Loop: 6
Loop: 7
Loop: 8
Loop: 9
Loop: 10
Loop: 11
Loop: 12
Loop: 13
Loop: 14
Loop: 15
Loop: 16
Loop: 17
Loop: 18
Loop: 19
Loop: 20
Loop: 21
Loop: 22
Loop: 23
Loop: 24
Loop: 25
Loop: 26
Loop: 27
Loop: 28
Loop: 29
Loop: 30
Loop: 31
Loop: 32
Loop: 33
Loop: 34
Loop: 35
Loop: 36
Loop: 37
Loop: 38
Loop: 39
Loop: 40
Loop: 41
Loop: 42
Loop: 43
Loop: 44
Loop: 45
Loop: 46
Loop: 47
Loop: 48
Loop: 49
Loop: 50
Loop: 51
Loop: 52
Loop: 53
Loop: 54
Loop: 55
Loop: 56
Loop: 57
Loop: 58
Loop: 59
Loop: 60
Loop: 61
Loop: 62
Loop: 63
Loop: 64
Loop: 65
Loop: 66
Loop: 67
Loop: 68
Loop: 69
Loop: 70
Loop: 71
Loop: 72
Loop: 73
Loop: 74
Loop: 75
Loop: 76
Loop: 77
Loop: 78
Loop: 79
Loop: 80
Loop: 81
Loop: 82
Loop: 83
Loop: 84
Loop: 85
Loop: 86
Loop: 87
Loop: 88
Loop: 89
Loop: 90
Loop: 91
Loop: 92
Loop: 93
Loop: 94
Loop: 95
Loop: 96
Loop: 97
Loop: 98
Loop: 99
Loop: 100
Loop: 101
Loop: 102
Loop: 103
Loop: 104
Loop: 105
Loop: 106
Loop: 107
Loop: 108
Loop: 109
Loop: 110
Loop: 11

In [10]:
t.head()

Unnamed: 0,sentiment,tweet
0,0.0,"[switchfoot, httptwitpiccom2y1zl, awww, thats,..."
1,0.0,"[upset, cant, update, facebook, texting, might..."
2,0.0,"[kenichan, i, dived, many, times, ball, manage..."
3,0.0,"[whole, body, feels, itchy, like, fire]"
4,0.0,"[nationwideclass, behaving, im, mad, i, cant, ..."


In [5]:
"""
One Hot Encode the Tweets so an array of 1, 0's represent presence of word in tweet
Will only one-hot encode each tweet as it is inputted in the training model in order to save memory
"""
def oneHot_encode(one_hot, tweet_input):
    for word in tweet_input['tweet']:
        one_hot[word] = 1
    return one_hot

In [15]:
"""
The dataframe 'one_hot' will represent a dataframe filled with zeros for each possibly word/symbol in the tweets
Will then be used when one hot encoding each tweet while training
Not enough memory to one hot encode all tweets at once
"""


words = set()

for index,row in t.iterrows():
    for word in row['tweet']:
        words.add(word)
        
zero_data = np.zeros(shape=(1,len(words)))
one_hot = pd.DataFrame(zero_data,columns = words)


In [16]:
zero_data

array([[0., 0., 0., ..., 0., 0., 0.]])

In [17]:
one_hot

Unnamed: 0,theremuslupins,osquot,ñññ,lindslou10,cutiiex3,tourthen,stalkeees,thamelissashow,houstonlast,sarahshe,...,httptwitpiccom6io3t,httptwitpiccom4j1fs,agopentecost,hrrmph,stephie,humelation,raeraesunshine,emmyhall,chloekatee,cavaliersso
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
"""
save tokenized data in csv
"""
tokenized_training = t
tokenized_training.to_csv('tokenized_tweets.csv')

In [18]:
"""
save one hot encoded array in a csv
"""
encoded = one_hot
encoded.to_csv('encode.csv')

In [4]:
"""
load the tokenized tweets
"""
csv = 'tokenized_tweets.csv'
clean_train = pd.read_csv(csv,index_col=0)
clean_train['tweet'] = clean_train['tweet'].apply(literal_eval)
clean_train.head()

  mask |= (ar1 == a)


Unnamed: 0,sentiment,tweet
0,0.0,"[switchfoot, httptwitpiccom2y1zl, awww, thats,..."
1,0.0,"[upset, cant, update, facebook, texting, might..."
2,0.0,"[kenichan, i, dived, many, times, ball, manage..."
3,0.0,"[whole, body, feels, itchy, like, fire]"
4,0.0,"[nationwideclass, behaving, im, mad, i, cant, ..."


In [3]:
"""
load the one hot encoded array
"""
csv = 'encode.csv'
encoded = pd.read_csv(csv,index_col=0)
encoded

Unnamed: 0,theremuslupins,osquot,ñññ,lindslou10,cutiiex3,tourthen,stalkeees,thamelissashow,houstonlast,sarahshe,...,httptwitpiccom6io3t,httptwitpiccom4j1fs,agopentecost,hrrmph,stephie,humelation,raeraesunshine,emmyhall,chloekatee,cavaliersso
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
"""
load weights array
"""
csv = 'weights_1_epoch.csv'
w = pd.read_csv(csv,index_col=0)
w

Unnamed: 0,0
0,-0.000193
1,-0.000530
2,0.000721
3,0.000135
4,-0.000299
5,0.000457
6,-0.001350
7,0.001373
8,-0.000736
9,-0.000253


In [5]:
encoded['i']

0    0.0
Name: i, dtype: float64

In [6]:
clean_train.shape

(1600000, 2)

In [5]:
positives = clean_train[:200]
negatives = clean_train[850000:850200]
print(positives.shape)
print(negatives.shape)
print((positives['sentiment'] == 0).sum())
print((negatives['sentiment'] == 1).sum())

(200, 2)
(200, 2)
200
200


In [38]:
all_clean_train = clean_train

In [7]:
c = pd.concat([positives, negatives])
c.shape
clean_train = c

In [8]:
encoded.shape

(1, 850276)

In [6]:
"""
Split data into training, test, and validation sets
Will one-hot encode later to save memory
"""

data = clean_train

#Training Date = 60%

train=data.sample(frac=0.8,random_state=100)
train.head()

#Validation Data = 10%
rest = data.loc[~data.index.isin(train.index), :]
validation = rest.sample(frac=.25, random_state=100)
validation.head()

#Test Data = 30%
ultimate_rest = rest.loc[~rest.index.isin(validation.index), :]
test = ultimate_rest

print(train.shape[1])

print("Training Data:{}".format(train.shape))
print("Training Neg:{}".format((train['sentiment'] == 0.0).sum()))
print("Training Pos:{}".format((train['sentiment'] == 1.0).sum()))

print("Validation Data:{}".format(validation.shape))
print("Validation Neg:{}".format((validation['sentiment'] == 0.0).sum()))
print("Validation Pos:{}".format((validation['sentiment'] == 1.0).sum()))

print("Test Data:{}".format(test.shape))
print("Test Neg:{}".format((test['sentiment'] == 0.0).sum()))
print("Test Pos:{}".format((test['sentiment'] == 1.0).sum()))

print(train.head())

2
Training Data:(1280000, 2)
Training Neg:640333
Training Pos:639667
Validation Data:(80000, 2)
Validation Neg:39957
Validation Pos:40043
Test Data:(240000, 2)
Test Neg:119710
Test Pos:120290
         sentiment                                              tweet
883393         1.0         [nicholasmw, 1, day, u, find, girl, worry]
461479         0.0  [nothing, tv, im, desperate, entertained, im, ...
103630         0.0  [very, excited, greys, tonight, not, happy, se...
1031265        1.0            [2pms, great, song, nichkhun, hwaiting]
219956         0.0                                  [my, teeth, hurt]


In [7]:
v_ = validation[:500]
print("Validation Data:{}".format(v_.shape))
print("Validation Neg:{}".format((v_['sentiment'] == 0.0).sum()))
print("Validation Pos:{}".format((v_['sentiment'] == 1.0).sum()))

Validation Data:(500, 2)
Validation Neg:251
Validation Pos:249


In [8]:
"""
Define the functions used to calculate probability and gradient descent using logistic regression
"""

def sigmoid(x):
    return (1/(1 + np.exp(-x)))

def probability(tweet, weights):
    w = np.transpose(weights)
    return sigmoid(np.dot(tweet, weights))
    
def cost_derivative(features, targets, prob, weights, penalty):
    error = prob-targets
    err = float(error)
    b = penalty*weights
    feat = np.transpose(features)
    a = err*feat
    aa = np.array(a)
    bb = np.reshape(b, (b.shape[0], 1))
    result = aa + bb
    return result
    
def update_weights(weights, lr, error):
    prod = lr * error
    p = np.array(prod)
    ww = np.reshape(weights, (weights.shape[0], 1))
    www = np.array(ww)
    w = www - p
    return w

In [9]:
"""
create tensor to hold encoded tweets in batch size of 100
"""
ten = pd.concat([encoded]*100)
featurestensor = torch.tensor(ten.values.astype(np.int32))
print(featurestensor.size())

torch.Size([100, 850276])


In [10]:
t = train['sentiment']
t = pd.DataFrame(t)
a = np.array(t)
traintensor = torch.tensor(a.astype(np.int32))
traintensor.shape

torch.Size([1280000, 1])

In [11]:
train_target = torch.tensor(train['sentiment'].values.astype(np.int32))
train_index = torch.tensor(train.index.values.astype(np.int32)) 

In [12]:
train_tensor = torch.utils.data.TensorDataset(train_index, train_target) 
train_loader = torch.utils.data.DataLoader(dataset = train_tensor, batch_size = 100, shuffle = True)

In [51]:
"""
validation function will allow us to see how our model is predicting sentiments while it trains
"""
def valid(weights, validation):
    
    correct = 0
    total = validation.shape[0]
    
    for i in range(validation.shape[0]):
        row = validation.iloc[i]
        target = row['sentiment']
        zero_encode = encoded.copy()
        features = oneHot_encode(zero_encode, row)
        
        if weights.shape[0] != features.shape[1]:
            w = np.transpose(weights)
        else:
            w = weights
                
        score = sigmoid(np.dot(features, w))
        
        pred = 0
        if (score > 0.5):
            pred = 1
        if (pred == target):
            correct += 1
            
    return 100*(correct/total)

In [52]:
"""
function that trains the training dataset using logistic regression and gradient descent
"""

def train_model(trainloader, featurestensor, epochs, learnrate, penalty, words):
    start_time = time.time()
    steps = 0
    e = 0
    print_every = 50
    batch_size = 100

    n_tweets, n_features = featurestensor.shape
    
    weights = np.random.normal(scale = 1/n_features**.5, size = n_features)

    while e < epochs:
        running_loss = 0
        val_loss = 0
        
        train_sample = trainloader.sample(frac=1).reset_index(drop=True)
        

        for i in range(n_tweets):
            
            original = words.copy()
            
            row = train_sample.iloc[i]
            
            features = oneHot_encode(original, row)

            steps += 1

            target = row['sentiment']
            
            if weights.shape[0] != features.shape[1]:
                w = np.transpose(weights)
            else:
                w = weights
            
            probability = sigmoid(np.dot(features, w))
            error = cost_derivative(features, target, probability, weights, penalty)
            
            if steps % print_every == 0:
                print("Epoch:{}/{} ---------- Validation: {}%".format(e+1, epochs, np.round(valid(w, v_), 2)))

        weights = update_weights(weights, learnrate, error)   
        
        # next epoch
        e += 1

    #if e % (epochs / 2) == 0:
            
        #print("Epoch: {} --------- Validation: {}".format(e+1, valid(weights)))
        #print("Epoch: {}".format(e + 1))
        #print("=========")
    end_time = time.time()
    print("Finished training!")
    print("Time Elapsed: {}".format(np.round(end_time - start_time, 2)))
    
    return weights

In [None]:
weights = train_model(clean_train, featurestensor, 4, 0.001, 0.07, encoded)

In [33]:
"""
save weights array in a csv
"""
weights_csv = pd.DataFrame(weights)
weights_csv.to_csv('weights_4_epoch.csv')

In [38]:
def acc(weights, test, encode):
    correct = 0
    total = test.shape[0]
    
    for i in range(test.shape[0]):
        row = test.iloc[i]
        target = row['sentiment']
        zero_encode = encode.copy()
        features = oneHot_encode(zero_encode, row)
        
        if weights.shape[0] != features.shape[1]:
            w = np.transpose(weights)
        else:
            w = weights
                
        score = sigmoid(np.dot(features, w))
        
        pred = 0
        if (score > 0.5):
            pred = 1
        if (pred == target):
            correct += 1
        
        if i%100 == 0:
            print("Current Example: {} ------ Accuracy: {}".format(i, 100*(correct/(i+1))))
    
    print('Final Accuracy: {0}'.format((correct/total)*100))

In [46]:
neg = test[500:1000]
pos = test[220000:220500]

#neg = test[479900:]

In [47]:
pos.head()

Unnamed: 0,sentiment,tweet
1465247,1.0,"[itssb, loolgud, afternoonhow, u, doin]"
1465248,1.0,"[love, kitty, funny, watching, try, catch, fly]"
1465269,1.0,"[watching, french, open, final, chijmes, annie..."
1465274,1.0,"[kapsali, thanks, i, meant, euwide, results, t..."
1465275,1.0,"[strawberries, â¥, yummy, grandma, brought, 4..."


In [56]:
neg_pos = pd.concat([pos, neg])
#neg_pos.shape

In [58]:
testing = neg_pos.sample(frac=1)

In [59]:
testing.head()

Unnamed: 0,sentiment,tweet
1467599,1.0,"[back, behind, dm, 2000, today, boys, girls, i..."
1466191,1.0,"[via, theleadguy, hey, there, great, meet, my,..."
4114,0.0,"[ivyclark, name, ironic, first, used, 10204, s..."
4717,0.0,"[starting, short, lunch, break, must, short, p..."
6369,0.0,"[hot, outside, ice, cream, time, whats, catch,..."


In [61]:
t = neg_pos[neg_pos.index == 4114]
t

Unnamed: 0,sentiment,tweet
4114,0.0,"[ivyclark, name, ironic, first, used, 10204, s..."


In [62]:
acc(w, testing, encoded)

Current Example: 0 ------ Accuracy: 100.0
Current Example: 100 ------ Accuracy: 51.48514851485149
Current Example: 200 ------ Accuracy: 51.741293532338304
Current Example: 300 ------ Accuracy: 50.83056478405316
Current Example: 400 ------ Accuracy: 50.374064837905244
Current Example: 500 ------ Accuracy: 51.49700598802395
Current Example: 600 ------ Accuracy: 51.08153078202995
Current Example: 700 ------ Accuracy: 51.06990014265336
Current Example: 800 ------ Accuracy: 51.186017478152316
Current Example: 900 ------ Accuracy: 50.943396226415096
Final Accuracy: 51.5
