### CBNA Summer School 2019 -- Online Social Networks and Media <br> Python Lab 2

Author: Zhen Zhu <br> __http://twitter.com/zhenzhunet__

In [None]:
import numpy as np
import pandas as pd
import re #regular expression

In [None]:
# define a function cleaning text with re
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of ``pattern`` deleted.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the spans to remove.

    Returns
    -------
    str
        ``input_txt`` with all non-overlapping matches of ``pattern``
        replaced by the empty string.
    """
    # Substitute with the pattern itself. Re-using each matched string as a
    # new regex misbehaves when a match contains regex metacharacters
    # (e.g. "+"), and when a short match such as a bare "@" is removed first
    # and thereby breaks up a longer match such as "@user".
    return re.sub(pattern, '', input_txt)

1. Data preprocessing

In [None]:
# import the twitter data
# the path is relative, so the CSV is looked up in the current working directory
myFilePath = 'Airline-Sentiment.csv'
# the file is decoded as latin1 (set explicitly rather than using the default encoding)
myData = pd.read_csv(myFilePath,encoding='latin1')

In [None]:
# preview the first 10 rows of the raw data to see which columns are available
myData.head(10)

In [None]:
# get positive tweets: keep the rows whose airline_sentiment equals 'positive'
myDataPos = myData[myData.airline_sentiment=='positive']

In [None]:
# get negative tweets: keep the rows whose airline_sentiment equals 'negative'
myDataNeg = myData[myData.airline_sentiment=='negative']

In [None]:
# get a random sample of 1000 positive tweets
# random_state fixes the seed so the same rows are drawn on every run (replication)
myDataPosSample = myDataPos.sample(n=1000,random_state=19)

In [None]:
# take a look at the text of the first 10 sampled positive tweets
# (converted to a list so the full strings are displayed)
list(myDataPosSample.text[:10])

In [None]:
# get a random sample of 1000 negative tweets
# random_state fixes the seed so the same rows are drawn on every run (replication)
myDataNegSample = myDataNeg.sample(n=1000,random_state=19)

In [None]:
# take a look at the text of the first 10 sampled negative tweets
# (converted to a list so the full strings are displayed)
list(myDataNegSample.text[:10])

In [None]:
# combine the two samples to form the data set for sentiment analysis
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0
# positives come first, then negatives; ignore_index=True renumbers the rows from 0
mySample = pd.concat([myDataPosSample, myDataNegSample], ignore_index=True)

In [None]:
# remove handles: an "@" followed by any run of word characters
# the pattern is a raw string because "\w" in a normal string literal is an
# invalid escape sequence (a warning today, an error in a future Python)
mySample['tidy_text'] = np.vectorize(remove_pattern)(mySample['text'], r"@[\w]*")

In [None]:
# compare tidy_text with text: the @handles should be gone from tidy_text
mySample.head(10)

In [None]:
# lowercase the letters of the cleaned text (the original text column is left untouched)
mySample['tidy_text'] = mySample['tidy_text'].str.lower()

In [None]:
# compare tidy_text with text: tidy_text should now be all lowercase
mySample.head(10)

In [None]:
# replace every character that is not a letter or '#' with a space
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the text unchanged
mySample['tidy_text'] = mySample['tidy_text'].str.replace("[^a-zA-Z#]", " ", regex=True)

In [None]:
# compare tidy_text with text: digits and punctuation should now be spaces
mySample.head(10)

In [None]:
# remove short words: only tokens longer than 3 characters are kept
def _keep_long_words(sentence):
    """Re-join the whitespace-separated tokens of `sentence` that exceed 3 characters."""
    long_words = [token for token in sentence.split() if len(token) > 3]
    return ' '.join(long_words)

mySample['tidy_text'] = mySample['tidy_text'].apply(_keep_long_words)

In [None]:
# compare tidy_text with text: words of 3 characters or fewer should be gone
mySample.head(10)

In [None]:
# randomize the entries: frac=1 returns every row, in shuffled order
# random_state fixes the seed for replication; the index labels are not reset here
mySample = mySample.sample(frac=1,random_state=19)

In [None]:
# check the randomization: positive and negative rows should now be interleaved
mySample.head(10)

In [None]:
# add a column with target values: 0 for 'negative', 1 for every other sentiment
# the boolean "is not negative" mask is cast to integers instead of mapping row by row
mySample['airline_sentiment_label'] = mySample['airline_sentiment'].ne('negative').astype('int64')

In [None]:
# check the final sample, including the new airline_sentiment_label column
mySample.head(10)

2. Predictive modelling

In [None]:
# split the shuffled sample by position: first 1600 rows for training, the rest for testing
train, test = mySample.iloc[:1600], mySample.iloc[1600:]

2.1 Bag-of-words approach

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# prepare a vectorizer: at most 1000 terms, skipping English stop words, terms that
# occur in more than 90% of the documents and terms that occur in fewer than 2 documents
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
# learn the vocabulary from the training text only
# NOTE: despite its name, x_bow is the fitted vectorizer (fit returns the estimator), not a feature matrix
x_bow = bow_vectorizer.fit(train['tidy_text']) 
# generate the training set feature matrix (one row per tweet, one column per term)
x_train = x_bow.transform(train['tidy_text'])
# generate the target vector (the 0/1 sentiment labels)
y_train = train['airline_sentiment_label']

In [None]:
# get the bag-of-words features (at most 1000, see max_features) learned from the training corpus
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# and fall back to the old name only on releases that predate it
if hasattr(x_bow, "get_feature_names_out"):
    # .tolist() keeps feature_names a plain list of str, as the old method returned
    feature_names = x_bow.get_feature_names_out().tolist()
else:
    feature_names = x_bow.get_feature_names()

In [None]:
# check the number of features (the size of the learned vocabulary)
len(feature_names)

In [None]:
# check the first 50 features
print(feature_names[:50])

In [None]:
# check again, this time showing only every 20th feature across the whole vocabulary
print(feature_names[::20])

In [None]:
# cross validate the model
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# 5-fold cross-validation of a logistic regression with default settings on the bag-of-words matrix
scores = cross_val_score(LogisticRegression(), x_train, y_train, cv=5)
# report the average of the five fold scores
print("Mean cross-validation accuracy: {:.4f}".format(np.mean(scores)))

In [None]:
# use cross-validation to tune the regularization parameter C
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]} # smaller C means stronger regularization (smaller budget/cost)
# each candidate C is evaluated with 5-fold cross-validation
# NOTE: x_train was built by a vectorizer fitted on the whole training set, so each
# validation fold has already influenced the vocabulary; the pipeline cell later in
# this notebook avoids that leak
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(x_train, y_train)
print("Best cross-validation score: {:.4f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

In [None]:
# final evaluation on the held-out test set
# the test text is transformed with the vocabulary learned from the training set
x_test = x_bow.transform(test['tidy_text'])
y_test = test['airline_sentiment_label']
# score the test set with the fitted grid search
print("Test score: {:.4f}".format(grid.score(x_test, y_test)))

In [None]:
# compare predicted with target: one row per test tweet with its cleaned text,
# its true label and the label predicted by the fitted grid search
myResultDict = {'tidy_text':list(test['tidy_text']),'target':list(y_test),'predicted':list(grid.predict(x_test))}
pd.DataFrame(myResultDict)

In [None]:
# pipeline avoiding leaking information when doing feature selection
from sklearn.pipeline import make_pipeline
# prepare a pipeline so that the vectorizer is fitted inside each cross-validation
# training fold instead of once on the whole training set
pipe = make_pipeline(CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english'),
                     LogisticRegression())
# parameters of a pipeline step are addressed as <step name>__<parameter>
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}
# NOTE: this rebinds grid and param_grid from the earlier bag-of-words grid search
grid = GridSearchCV(pipe, param_grid, cv=5)
# the pipeline takes the raw cleaned text, not a precomputed feature matrix
grid.fit(train['tidy_text'], y_train)
print("Best cross-validation score: {:.4f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

In [None]:
# final evaluation of the bag-of-words pipeline on the held-out test text
print("Test score: {:.4f}".format(grid.score(test['tidy_text'], y_test)))

2.2 TF-IDF approach

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# prepare a pipeline so that the TF-IDF vectorizer is fitted inside each
# cross-validation training fold; vectorizer settings match the bag-of-words pipeline
pipe = make_pipeline(TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english'),
                     LogisticRegression())
# parameters of a pipeline step are addressed as <step name>__<parameter>
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}
# NOTE: this rebinds pipe, param_grid and grid from the bag-of-words pipeline
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(train['tidy_text'], y_train)
print("Best cross-validation score: {:.4f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

In [None]:
# final evaluation of the TF-IDF pipeline on the held-out test text
print("Test score: {:.4f}".format(grid.score(test['tidy_text'], y_test)))

In [None]:
# feature exploration
# pull the fitted TF-IDF step out of the best pipeline found by the grid search
vectorizer = grid.best_estimator_.named_steps["tfidfvectorizer"]
# transform the training dataset:
# NOTE: this rebinds x_train, which held the bag-of-words matrix until now
x_train = vectorizer.transform(train['tidy_text'])
# find maximum value for each of the features over dataset:
max_value = x_train.max(axis=0).toarray().ravel()
# feature indices ordered from the smallest to the largest maximum tf-idf value
sorted_by_tfidf = max_value.argsort()
# get feature names as an array so they can be indexed with the sorted index arrays
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# and fall back to the old name only on releases that predate it
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = np.asarray(vectorizer.get_feature_names_out())
else:
    feature_names = np.array(vectorizer.get_feature_names())

print("Features with lowest tfidf:\n{}".format(
      feature_names[sorted_by_tfidf[:20]]))

print("Features with highest tfidf: \n{}".format(
      feature_names[sorted_by_tfidf[-20:]]))

In [None]:
# order the features by their inverse document frequency, smallest first
sorted_by_idf = np.argsort(vectorizer.idf_)
# show the 100 features with the lowest idf
print("Features with lowest idf:\n{}".format(
       feature_names[sorted_by_idf[:100]]))

3. Exercises

3.1 Get the parameter tuning and evaluation results using TF-IDF with `max_features = 1500`.

3.2 Get the parameter tuning and evaluation results using TF-IDF without stop-word removal (i.e. leave `stop_words` at its default value).