In [15]:
import pandas as pd
import numpy as np

In [16]:
from nltk import TweetTokenizer
from nltk.stem import PorterStemmer
import re

def preprocess_tweet(tweet):
    '''
    Normalise the text of a single tweet.

    Steps, in order: lower-case the text, delete URLs, delete @mentions,
    collapse every run of whitespace into one space, and drop the leading
    "#" from hashtags. The result is not stripped, so a tweet that began or
    ended with a URL/mention can keep a single leading/trailing space.

    arguments: tweet = a single tweet in form of string
    returns: the cleaned tweet as a string
    '''
    # lower-case for convenience so later matching is case-insensitive
    tweet = tweet.lower()
    # delete URLs ("www.<...>" or "http(s)://<...>"); they are removed, not
    # replaced by a placeholder token
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', tweet)
    # delete @username mentions (also removed rather than replaced)
    tweet = re.sub(r'@[^\s]+', '', tweet)
    # collapse runs of whitespace (including those left by the deletions above)
    tweet = re.sub(r'[\s]+', ' ', tweet)
    # convert "#topic" to just "topic"
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    return tweet

def stemSentence(tweet):
    '''
    Tokenise a tweet and replace each token with its Porter stem, so that
    inflected forms of a word map to a shared root.

    arguments: tweet = a single tweet in form of string
    returns: the stems joined into one string; each stem is followed by a
             single space, so a non-empty result ends with a space
    '''
    stemmer = PorterStemmer()
    words = TweetTokenizer().tokenize(tweet)
    return ''.join(stemmer.stem(word) + ' ' for word in words)

In [17]:
# NOTE(review): machine-specific absolute path -- point this at your local copy of the test file.
HOLDOUT_CSV = '/Users/faiz/FYPCoding/FYP/trainingandtestdata/testdata.csv'

# The file has no header row, so the six columns are named here. Only the label
# and the tweet text are used for the analysis; id, date, flag and user are not loaded.
holdout = pd.read_csv(
    HOLDOUT_CSV,
    header=None,
    names=['sentiment', 'id', 'date', 'flag', 'user', 'text'],
    usecols=['sentiment', 'text'],
)
# in sentiment140 dataset, positive = 4, negative = 0; so we change positive to 1
holdout['sentiment'] = holdout['sentiment'].replace(4, 1)
# Discard rows labelled 2. The explicit copy makes the filtered frame independent,
# so the column assignments in later cells do not write into a slice of the
# unfiltered frame (which triggers SettingWithCopyWarning).
holdout = holdout[holdout['sentiment'] != 2].copy()


In [18]:
# Clean the text of every tweet with preprocess_tweet, one row at a time
holdout['text'] = holdout['text'].map(preprocess_tweet)

In [19]:
# Convert fields that are empty or contain only whitespace to NaN.
# The result is reassigned instead of using inplace=True, so no frame that may
# be a filtered slice is mutated in place.
holdout = holdout.replace(r'^\s*$', np.nan, regex=True)

In [20]:
# Print a summary of the frame: number of entries, non-null count per column, dtypes
holdout.info()
# No missing data detected: in the recorded output the non-null count of each
# column equals the number of entries

<class 'pandas.core.frame.DataFrame'>
Int64Index: 359 entries, 0 to 497
Data columns (total 2 columns):
sentiment    359 non-null int64
text         359 non-null object
dtypes: int64(1), object(1)
memory usage: 8.4+ KB


In [21]:
# Drop every row that contains a missing value, then re-check the summary.
# Reassigning (rather than inplace=True) avoids mutating, in place, a frame that
# may be a filtered slice.
holdout = holdout.dropna()
holdout.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 359 entries, 0 to 497
Data columns (total 2 columns):
sentiment    359 non-null int64
text         359 non-null object
dtypes: int64(1), object(1)
memory usage: 8.4+ KB


In [22]:
# Replace each cleaned tweet with its stemmed form
holdout['text'] = holdout['text'].map(stemSentence)



In [23]:
# Show two processed tweets; 1 and 0 are index labels of the frame, not positions
print(holdout.loc[1, 'text'])
print(holdout.loc[0, 'text'])

read my kindl 2 ... love it ... lee child is good read . 
i loooooooovvvvvvee my kindl 2 . not that the dx is cool , but the 2 is fantast in it own right . 


In [24]:
import pickle
from sklearn.metrics import accuracy_score,confusion_matrix

def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes (or fails).

    NOTE: unpickling can execute arbitrary code -- only load files you trust.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# load the fitted models from disk (one per feature representation)
loaded_model_cv = _load_pickle('cvsent_model.sav')
loaded_model_tvc = _load_pickle('tvcsent_model.sav')

# load the fitted vectorizers whose transform() output feeds the models above
cv_loaded = _load_pickle('cv_fitted.sav')
tvc_loaded = _load_pickle('tvc_fitted.sav')

In [32]:
def report_holdout_performance(title, vectorizer, model, texts, labels):
    """Vectorise ``texts``, predict with ``model`` and print holdout metrics.

    Prints, in order: ``title``, the accuracy score, the confusion matrix
    divided by the number of samples (so each cell is a fraction of all
    samples), and the value of ``model.score`` on the same data.

    arguments:
        title      = heading printed before the metrics
        vectorizer = fitted object exposing ``transform``
        model      = fitted object exposing ``predict`` and ``score``
        texts      = preprocessed tweet texts
        labels     = true sentiment labels aligned with ``texts``
    returns: tuple ``(features, predictions, score)``
    """
    features = vectorizer.transform(texts)
    predictions = model.predict(features)

    print(title)
    print('Accuracy score holdout set: ', accuracy_score(labels, predictions))
    print('Confusion matrix holdout set: \n', confusion_matrix(labels, predictions)/len(labels))

    score = model.score(features, labels)
    print(score)
    return features, predictions, score


# Features from cv_loaded scored with the model paired with it
y_cv = holdout.sentiment
cv_transformhold, y_predicted_cv, result_cv = report_holdout_performance(
    'CountVectorizer Performance', cv_loaded, loaded_model_cv, holdout.text, y_cv)
X_cv = cv_transformhold
print()

# Features from tvc_loaded scored with the model paired with it
y_tvc = holdout.sentiment
tvc_transformhold, y_predicted_tvc, result_tvc = report_holdout_performance(
    'TF-IDF Performance', tvc_loaded, loaded_model_tvc, holdout.text, y_tvc)
X_tvc = tvc_transformhold

CountVectorizer Performance
Accuracy score holdout set:  0.8217270194986073
Confusion matrix holdout set: 
 [[0.38997214 0.10306407]
 [0.07520891 0.43175487]]

TF-IDF Performance
Accuracy score holdout set:  0.8105849582172702
Confusion matrix holdout set: 
 [[0.38718663 0.10584958]
 [0.08356546 0.42339833]]


In [26]:
# Transform the processed tweet text with the loaded TF-IDF vectorizer
tvc_transformhold = tvc_loaded.transform(holdout.text)

y = holdout.sentiment
X = tvc_transformhold

y_predicted = loaded_model_tvc.predict(X)

print('Accuracy score holdout set: ', accuracy_score(y, y_predicted))
# dividing by len(y) turns the counts into fractions of all holdout samples
print('Confusion matrix holdout set: \n', confusion_matrix(y, y_predicted)/len(y))

# `loaded_model` is never defined in the visible cells (NameError on a fresh
# kernel). X comes from tvc_loaded, so score with the matching TF-IDF model.
result = loaded_model_tvc.score(X, y)
print(result)


Accuracy score holdout set:  0.8105849582172702
Confusion matrix holdout set: 
 [[0.38718663 0.10584958]
 [0.08356546 0.42339833]]
0.8105849582172702


In [77]:
# Classify one ad-hoc sentence with the CountVectorizer pipeline
stra = ["i like you"]
# Run the same cleaning + stemming that the holdout text received before it was
# vectorised -- presumably the models were trained on text prepared this way; confirm.
stra_processed = [stemSentence(preprocess_tweet(sentence)) for sentence in stra]
X = cv_loaded.transform(stra_processed)
# `loaded_model` is not defined in the visible cells; features built by cv_loaded
# must go to the model paired with it.
y_predicted = loaded_model_cv.predict(X)
# print the predicted label without the surrounding array brackets
print(str(y_predicted).strip('[]'))


0
