In [63]:
import pandas as pd

In [64]:
# Training files; they are read as tab-separated text without a header row.
files = [
    'twitter-2013train.txt',
    'twitter-2015train.txt',
    'twitter-2016train.txt',
]

In [65]:
# One frame per training file: tab-separated, no header row (columns get integer names).
df0, df1, df2 = (
    pd.read_csv(path, delimiter='\t', header=None)
    for path in files
)

In [66]:
# Concatenate the three yearly frames into a single frame with a fresh 0..n-1 index.
data = pd.concat(
    (df0, df1, df2),
    ignore_index=True,
)

In [67]:
# Inspect the column labels: the files were read with header=None, so pandas assigned integer names.
df0.columns

Int64Index([0, 1, 2], dtype='int64')

In [68]:
# Give the three positional columns descriptive names (mutates `data` in place).
# presumably 'serial' is the tweet id and 'opinion' the sentiment label — confirm against the data source
data.columns = ['serial', 'opinion', 'tweet_text']

In [69]:
# Class balance: number of rows per sentiment label.
data.groupby('opinion').count()

Unnamed: 0_level_0,serial,tweet_text
opinion,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,2374,2374
neutral,6840,6840
positive,6827,6827


In [70]:
# (rows, columns) of the combined training frame.
data.shape

(16041, 3)

In [71]:
# Preview the first rows of the combined training frame.
data.head()

Unnamed: 0,serial,opinion,tweet_text
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I\u2019m going t...
1,263405084770172928,negative,Theo Walcott is still shit\u002c watch Rafa an...
2,262163168678248449,negative,its not that I\u2019m a GSP fan\u002c i just h...
3,264249301910310912,negative,Iranian general says Israel\u2019s Iron Dome c...
4,262682041215234048,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...


### Data Clean up & Pre-processing

In [72]:
# Step 1: casefold

import nltk

# Case-fold every raw tweet so that later token comparisons are case-insensitive.
lowerTweets = [raw_tweet.casefold() for raw_tweet in data['tweet_text']]

In [73]:
# Make sure NLTK's stopword corpus is available locally (returns True; reports "up-to-date" when already present).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Baz-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [74]:
# Step 2: remove stopwords applying on all tweets 

from nltk.corpus import stopwords
import re

# English stopwords as a set for O(1) membership tests.
stops = set(stopwords.words("english"))

# Tokenise on runs of non-word characters, drop stopwords and re-join with single spaces.
# The pattern is a raw string: "\W" in a normal string literal is an invalid escape
# sequence that newer Python versions warn about.
# NOTE(review): the head() output above shows raw tweets containing literal escapes such
# as \u2019 and \u002c; splitting on \W+ turns them into tokens like "u2019m", which end
# up as "um"/"uc" once digits are removed. Consider decoding those escapes before
# tokenising — in the training AND the test pipeline together.
filtered_tweets = [
    " ".join(word for word in re.split(r"\W+", doc) if word not in stops).strip()
    for doc in lowerTweets
]

In [75]:
# Step 3: Remove punctuation and digits from tweets and replace it by space
#### NOTE: Through different combinations, it is observed that accuracy is decreased after removing punctuation.

import string

def remove_punctuation(input_text):
    """Replace every ASCII punctuation character in each tweet with a space.

    Parameters
    ----------
    input_text : iterable of str
        The tweets to clean.

    Returns
    -------
    list of str
        One cleaned string per input tweet, in the same order. String lengths
        are unchanged because each symbol is swapped for exactly one space.
    """
    # Build the translation table once; it does not depend on the tweet.
    punct = string.punctuation
    trantab = str.maketrans(punct, len(punct) * ' ')
    return [tweet.translate(trantab) for tweet in input_text]

def remove_digits(input_text):
    """Delete every run of digits from each tweet.

    Parameters
    ----------
    input_text : iterable of str
        The tweets to clean.

    Returns
    -------
    list of str
        One string per input tweet with all digit runs removed. Surrounding
        whitespace is left untouched, so double spaces may remain.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    return [re.sub(r'\d+', '', tweet) for tweet in input_text]

# Punctuation-stripped variant, kept only for comparison; nothing later in the notebook reads it.
punctuation_removed_tweets = remove_punctuation(filtered_tweets)
# Not the last expression of the cell, so this slice is evaluated and discarded (nothing is displayed).
punctuation_removed_tweets[0:5]

# Here we will skip removing the punctuation,
# and will use the "remove_digits" function with "filtered_tweets" (output of stopwords)
digits_removed_tweets = remove_digits(filtered_tweets)
#digits_removed_tweets[0:5]


In [76]:
# Step 4: Perform trimming to remove extra whitespaces:

# Collapse every run of whitespace to one space and trim both ends of each tweet.
spaces_removed_tweets = [" ".join(text.split()) for text in digits_removed_tweets]

In [77]:
# Step 5: stemwords
from nltk.stem.porter import PorterStemmer

def stemDocs(f_docs):
    """Porter-stem every whitespace-separated word of each document.

    Parameters
    ----------
    f_docs : iterable of str
        Documents whose words are separated by whitespace.

    Returns
    -------
    list of str
        One string per document: the stemmed words joined by single spaces.
    """
    # One stemmer instance is enough; the original built a new one for every word.
    stemmer = PorterStemmer()
    return [
        " ".join(stemmer.stem(word) for word in doc.split()).strip()
        for doc in f_docs
    ]
    
# Stemmed variant of the whitespace-normalised tweets; the unstemmed list is kept as well.
stemmed_tweets = stemDocs(spaces_removed_tweets)
#stemmed_tweets[0:5]

### After multiple trials with different combinations, the highest accuracy (64.99%) is reached through the below steps:

### 1- Casefolding
### 2- Remove stopwords
### 3- Remove digits
### 4- Trimming (remove whitespaces)

In [78]:
# Attach both preprocessed variants (stemmed / non-stemmed) next to the raw text.
data = data.assign(
    stemmed_tweet=stemmed_tweets,
    non_stemmed_tweet=spaces_removed_tweets,
)
data.head()

Unnamed: 0,serial,opinion,tweet_text,stemmed_tweet,non_stemmed_tweet
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I\u2019m going t...,ga hous hit um go chapel hill sat,gas house hit um going chapel hill sat
1,263405084770172928,negative,Theo Walcott is still shit\u002c watch Rafa an...,theo walcott still shit uc watch rafa johnni d...,theo walcott still shit uc watch rafa johnny d...
2,262163168678248449,negative,its not that I\u2019m a GSP fan\u002c i just h...,um gsp fan uc hate nick diaz ut wait februari,um gsp fan uc hate nick diaz ut wait february
3,264249301910310912,negative,Iranian general says Israel\u2019s Iron Dome c...,iranian gener say israel us iron dome ut deal ...,iranian general says israel us iron dome ut de...
4,262682041215234048,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...,tehran uc mon amour obama tri establish tie mu...,tehran uc mon amour obama tried establish ties...


In [79]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Result collectors: (description, value) tuples are appended after each experiment.
accuracies, f1_scores = [], []

### Step 2. Features extraction
#### A. Trying Word embedding on the preprocessed Non stemmed data

In [80]:
import nltk
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors

In [81]:
# Load the pretrained GoogleNews word2vec vectors (binary format) as the embedding model.
# NOTE(review): the recorded output of this cell is a MemoryError, and the path is an
# absolute local path — consider a configurable data directory and, if memory stays a
# problem, the `limit=` argument of load_word2vec_format.
word2vec_path = 'E:/NU BDDS-PD/Practical Data Mining/word2vec/GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

MemoryError: 

In [None]:
#transforming the tweets to word embedding vectors using Google news W2V (300)
# Each tweet becomes the mean of its word vectors; out-of-vocabulary words count as
# zero vectors, i.e. they still contribute to the divisor.
embedding_dim = 300  # width of the GoogleNews vectors; the reshape cell that follows assumes 300 as well

tweet_vectors = []
for text in data.non_stemmed_tweet:
    words = text.split()
    vector = np.zeros(embedding_dim)
    for word in words:
        if word in model:
            vector += model[word]
    # A tweet left empty by preprocessing keeps the all-zero vector; the original
    # divided 0 by 0 here and raised ZeroDivisionError.
    if words:
        vector /= len(words)
    tweet_vectors.append(vector)

# Keep the flat 1-D layout that the following reshape cell expects, but build it with a
# single concatenation instead of re-allocating through np.append on every tweet.
vectorized_tweets = np.concatenate(tweet_vectors) if tweet_vectors else np.array([])

In [None]:
# Fold the flat vector back into one 300-wide row per tweet and wrap it in a DataFrame.
w2v_tweets = pd.DataFrame(
    np.reshape(vectorized_tweets, (-1,300), 'a')
)

#### Splitting w2v_tweets into train & test

In [None]:
# Use 70% of the provided data as training data and the remaining 30% to test a classifier 
from sklearn.model_selection import train_test_split


# 70/30 split of the embedding features; random_state pins the shuffle.
tweets_train, tweets_test, train_labels, test_labels = train_test_split(
    w2v_tweets,
    data['opinion'],
    test_size=0.3,
    random_state=0,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler #will be used to scale the data between (0,1) to avoid -ve input

In [None]:
scaler = MinMaxScaler()

# Learn the per-feature min/max from the training rows only.
scaled_tweets_train = scaler.fit_transform(tweets_train)
# Apply the SAME training statistics to the test rows. Calling fit_transform here (as
# before) re-fits the scaler on the test split, so train and test end up on different
# scales and test information leaks into preprocessing.
# Test values outside the training range are clipped so the features stay in [0, 1].
scaled_tweets_test = np.clip(scaler.transform(tweets_test), 0, 1)

#### Splitting Non stemmed tweets into train & test and apply CountVectorizer to it

In [None]:
# Use 70% of the provided data as training data and the remaining 30% to test a classifier 
from sklearn.model_selection import train_test_split

# Same 70/30 split (same random_state) applied to the non-stemmed text.
# Note that this rebinds tweets_train / tweets_test, which previously held the embedding rows.
tweets_train, tweets_test, train_labels, test_labels = train_test_split(
    data["non_stemmed_tweet"],
    data['opinion'],
    test_size=0.3,
    random_state=0,
)

In [None]:
# Bag-of-words counts: the vocabulary is learned from the training tweets only.
vectorizer = CountVectorizer()
vectorizer.fit(tweets_train)

# Encode both splits with that fixed vocabulary.
tweets_train_vectorized = vectorizer.transform(tweets_train)
tweets_test_vectorized = vectorizer.transform(tweets_test)

#### Merging Training Dataset of Countvectorizer "tweets_train_vectorized" & Scaled Training Dataset of Word Embeddings "scaled_tweets_train"

In [None]:
# Convert the sparse count matrix to a dense DataFrame so it can be concatenated column-wise.
# NOTE(review): .toarray() materialises every zero; on a large vocabulary this is memory-hungry.
tweets_train_vectorized_df = pd.DataFrame(tweets_train_vectorized.toarray())
#tweets_train_vectorized_df.shape
# Last 5 columns of the first 6 rows. A negative slice is used instead of the hard-coded
# 25285:25290 so the preview stays correct if the vocabulary size changes.
tweets_train_vectorized_df.iloc[0:6, -5:]

In [None]:
# Wrap the scaled training embeddings in a DataFrame (default 0..n-1 index and column labels).
scaled_tweets_train_df = pd.DataFrame(scaled_tweets_train)
# Last 5 embedding columns (295-299) of the first 6 rows.
scaled_tweets_train_df.iloc[:6, 295:300]

In [None]:
# Both frames must have the same number of rows before they are merged side by side.
for frame in (tweets_train_vectorized_df, scaled_tweets_train_df):
    print(frame.shape)

In [None]:
# Place count features and scaled embedding features side by side;
# ignore_index renumbers the merged columns 0..k-1.
df_concat_train = pd.concat(
    (tweets_train_vectorized_df, scaled_tweets_train_df),
    axis='columns',
    ignore_index=True,
)
df_concat_train.head()

In [None]:
# Derive the column offsets from the frames instead of hard-coding 25285 / 25585,
# so the check still points at the right columns when the vocabulary size changes.
n_count_features = tweets_train_vectorized_df.shape[1]
print(df_concat_train.iloc[0:6, n_count_features - 5:n_count_features])   # last 5 count columns after merging
print(df_concat_train.iloc[0:6, -5:])   # last 5 embedding columns after merging

In [None]:
# First 6 rows x first 6 embedding columns BEFORE the merge.
# Not the last expression of the cell, so the notebook does not display it.
scaled_tweets_train_df.iloc[0:6,0:6]
# First 6 rows of merged columns 25285-25295 AFTER the merge.
# NOTE(review): assuming 25290 count columns (as the neighbouring cells do), this window is the
# last 5 count columns followed by the first 6 embedding columns — confirm against the printed shapes.
df_concat_train.iloc[0:6,25285:25296]

#### Merging Test Dataset of Countvectorizer "tweets_test_vectorized" & Scaled Test Dataset of Word Embeddings "scaled_tweets_test"

In [None]:
# Dense DataFrames for the two held-out feature blocks (counts and scaled embeddings),
# prepared for the column-wise merge.
tweets_test_vectorized_df = pd.DataFrame(tweets_test_vectorized.toarray())
scaled_tweets_test_df = pd.DataFrame(scaled_tweets_test)

# Row counts must match before merging.
for frame in (tweets_test_vectorized_df, scaled_tweets_test_df):
    print(frame.shape)

In [None]:
# Last 5 embedding columns (295-299) of the first 6 scaled held-out rows.
scaled_tweets_test_df.iloc[0:6,295:300]

In [None]:
# Held-out counterpart of the merged training matrix: counts first, embeddings second.
df_concat_test = pd.concat(
    (tweets_test_vectorized_df, scaled_tweets_test_df),
    axis='columns',
    ignore_index=True,
)
df_concat_test.head()

### Use the Merged data with Logistic Regression

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np

In [None]:
# Create a Logistic Regression classifier:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn import metrics

# Bagging ensemble of 100 logistic-regression models (fixed C, seeded with random_state=1).
# NOTE(review): newer scikit-learn releases rename `base_estimator` to `estimator` — confirm
# against the installed version before upgrading.
clfr=BaggingClassifier(base_estimator=LogisticRegression(C=0.3359818286283781), random_state=1,n_estimators=100)

# Train on the merged (count + embedding) training matrix.
clfr.fit(df_concat_train,train_labels)

# Evaluate on the held-out 30% split.
predicted = clfr.predict(df_concat_test)
acc = metrics.accuracy_score(test_labels,predicted)

# Report accuracy and the per-class report, then record accuracy and weighted F1
# in the shared result lists.
print ('Accuracy of Merged Data (Word Embdeddings & CountVectorizer) + Logistic Regression (Non Stemmed Tweets) = '+str(acc*100)+'%')
print (metrics.classification_report(test_labels,predicted))
accuracies.append(('Accuracy of Merged Data (Word Embdeddings & CountVectorizer) + Logistic Regression (Non Stemmed Tweets)', acc*100))
f1_scores.append(('F1-score of Merged Data (Word Embdeddings & CountVectorizer) + Logistic Regression (Non Stemmed Tweets)', metrics.f1_score(test_labels,predicted, average='weighted')))

### Read the provided Test Dataset

In [None]:

# Name of the provided test file. The read_csv call that follows uses the literal 'test.csv', not this list.
files_test = ['test.csv']

In [None]:
# Echo the list to confirm its contents.
files_test

In [None]:
# Load the provided test set with pandas' default CSV settings (comma-separated, first row used as header).
data_test = pd.read_csv('test.csv')
# Earlier attempt that read tab-separated, header-less files; kept for reference only.
#df3, df4 = [pd.read_csv(name, delimiter = '\t', header = None) for name in files_test]

In [None]:
# Preview the first rows of the provided test set.
data_test.head()

In [None]:
# (rows, columns) of the provided test set.
data_test.shape

In [None]:
# Column names of the test set; later cells read the 'tweet' and 'id' columns.
data_test.columns

### Test Data Clean Up & Pre-processing

In [None]:
# Step 1: casefold

import nltk

# Case-fold the test tweets exactly as was done for the training tweets.
lowerTweets_test = [raw_tweet.casefold() for raw_tweet in data_test['tweet']]
lowerTweets_test[:5]

In [None]:
# Step 2: remove stopwords applying on all tweets 

from nltk.corpus import stopwords
import re

# English stopwords as a set for O(1) membership tests.
stops = set(stopwords.words("english"))

# Tokenise on runs of non-word characters, drop stopwords and re-join with single spaces.
# The pattern is a raw string: "\W" in a normal string literal is an invalid escape
# sequence that newer Python versions warn about.
filtered_tweets_test = [
    " ".join(word for word in re.split(r"\W+", doc) if word not in stops).strip()
    for doc in lowerTweets_test
]
filtered_tweets_test[0:5]

In [None]:
# Step 3: Remove punctuation and digits from tweets and replace it by space
#### NOTE: Through different combinations, it is observed that accuracy is decreased after removing punctuation.

import string

def remove_punctuation(input_text):
    """Replace every ASCII punctuation character in each tweet with a space.

    Parameters
    ----------
    input_text : iterable of str
        The tweets to clean.

    Returns
    -------
    list of str
        One cleaned string per input tweet, in the same order. String lengths
        are unchanged because each symbol is swapped for exactly one space.
    """
    # Build the translation table once; it does not depend on the tweet.
    punct = string.punctuation
    trantab = str.maketrans(punct, len(punct) * ' ')
    return [tweet.translate(trantab) for tweet in input_text]

def remove_digits(input_text):
    """Delete every run of digits from each tweet.

    Parameters
    ----------
    input_text : iterable of str
        The tweets to clean.

    Returns
    -------
    list of str
        One string per input tweet with all digit runs removed. Surrounding
        whitespace is left untouched, so double spaces may remain.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    return [re.sub(r'\d+', '', tweet) for tweet in input_text]

# Punctuation removal is intentionally skipped for the test data.

# Here we will skip removing the punctuation,
# and will use the "remove_digits" function with "filtered_tweets_test" (output of stopwords)
digits_removed_tweets_test = remove_digits(filtered_tweets_test)
digits_removed_tweets_test[0:5]

In [None]:
# Step 4: Perform trimming to remove extra whitespaces:

# Collapse every run of whitespace to one space and trim both ends of each test tweet.
spaces_removed_tweets_test = [" ".join(text.split()) for text in digits_removed_tweets_test]

spaces_removed_tweets_test[:5]

In [None]:
# Store the cleaned (case-folded, stopword- and digit-free, whitespace-normalised) text
# next to the raw test tweets; no stemming is applied to the test data.
data_test['preprocessed_tweet'] = spaces_removed_tweets_test
data_test.head()

### Apply both CountVectorizer & Word Embeddings on Test Data

#### 1.  Applying CountVectorizer to the Provided Test Data

In [None]:
# Apply CountVectorizer on Test Dataset:
# transform() reuses the already-fitted `vectorizer`, so the test counts share its vocabulary and column order.

new_tweets_test_vectorized = vectorizer.transform(data_test['preprocessed_tweet'])
new_tweets_test_vectorized
#vectorizer

#### 2.  Applying W2V on Provided Test Data

In [None]:
# Transforming the TEST TWEETS to word embedding vectors using Google news W2V (300):
# Each tweet becomes the mean of its word vectors; out-of-vocabulary words count as
# zero vectors, i.e. they still contribute to the divisor.
embedding_dim = 300  # width of the GoogleNews vectors; the reshape cell that follows assumes 300 as well

test_tweet_vectors = []
for text in data_test.preprocessed_tweet:
    words = text.split()
    vector = np.zeros(embedding_dim)
    for word in words:
        if word in model:
            vector += model[word]
    # A tweet left empty by preprocessing keeps the all-zero vector; the original
    # divided 0 by 0 here and raised ZeroDivisionError.
    if words:
        vector /= len(words)
    test_tweet_vectors.append(vector)

# Keep the flat 1-D layout that the following reshape cell expects, but build it with a
# single concatenation instead of re-allocating through np.append on every tweet.
vectorized_test_tweets = np.concatenate(test_tweet_vectors) if test_tweet_vectors else np.array([])
    


In [None]:
# Fold the flat vector back into one 300-wide row per test tweet and wrap it in a DataFrame.
w2v_test_tweets = pd.DataFrame(
    np.reshape(vectorized_test_tweets, (-1,300), 'a')
)

In [None]:
# Scaling w2v_test_tweets:
# The submission data must be scaled with the statistics of the TRAINING embeddings that
# the classifier was fitted on. Fitting a fresh scaler on the test data (as before) puts
# the two on different scales. The training rows are re-derived here with the same split
# parameters as the earlier embedding split, because tweets_train was later rebound to the
# raw text split.
w2v_train_rows, _ = train_test_split(w2v_tweets, test_size=0.3, random_state=0)
scaler = MinMaxScaler().fit(w2v_train_rows)
# Values outside the training range are clipped so the features stay in [0, 1].
scaled_test_tweets = np.clip(scaler.transform(w2v_test_tweets), 0, 1)


#### 3.  Merging Vectorized Test Dataset of Countvectorizer "new_tweets_test_vectorized" & Scaled Test Dataset of Word Embeddings "scaled_test_tweets"

In [None]:
# Dense DataFrame of the test count features, needed for the column-wise merge below.
# NOTE(review): .toarray() materialises every zero of the sparse matrix — watch memory on large inputs.
new_tweets_test_vectorized_df = pd.DataFrame(new_tweets_test_vectorized.toarray()) # convert scipy.sparse.csr.csr_matrix to pandas df

In [None]:
# Wrap the scaled test embeddings in a DataFrame (default 0..n-1 index and column labels).
scaled_test_tweets_df = pd.DataFrame(scaled_test_tweets)

In [None]:
# Count features first, embedding features second — the same column layout as df_concat_train.
df_merged_test = pd.concat(
    (new_tweets_test_vectorized_df, scaled_test_tweets_df),
    axis='columns',
    ignore_index=True,
)
df_merged_test.shape

### Apply Logistic Regression on the Merged Test Data

In [None]:
# Create a Logistic Regression classifier:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Final model: a single logistic regression (same C as before) re-fitted on the merged training matrix.
# NOTE(review): this rebinds `clfr`, replacing the bagging ensemble evaluated earlier, while the
# submission file name written at the end mentions "bagging" — confirm which model is intended.
clfr = LogisticRegression(C=0.3359818286283781)
clfr.fit(df_concat_train,train_labels)

# Predict sentiment labels for the provided test set.
predicted_test = clfr.predict(df_merged_test)

In [None]:
# Not the last expression of the cell, so this bare name displays nothing.
predicted_test
# Number of predictions, followed by the first 11 predicted labels.
print(len(predicted_test))
print(predicted_test[0:11])

In [None]:
import numpy as np

# Numeric codes used in the submission file.
label_codes = {'neutral': 0, 'positive': 1, 'negative': 2}

# Fail loudly on an unexpected label. The previous if-chain silently skipped such
# predictions, which would shorten the list and shift every following label onto the wrong id.
unknown_labels = set(predicted_test) - set(label_codes)
if unknown_labels:
    raise ValueError('Unexpected predicted labels: ' + repr(sorted(unknown_labels)))

label_list = [label_codes[label] for label in predicted_test]

print(len(label_list))
print(label_list[0:11])

label_array = np.array(label_list)

In [None]:
# Build the submission frame: one row per test id with its numeric label.
result = data_test[['id']].copy()
# Assign positionally. Wrapping the array in a DataFrame aligns on the index instead,
# which would silently yield NaN labels if data_test's index were not 0..n-1;
# a length mismatch raises here.
result['label'] = label_array
result.to_csv("CountVec_word2vec_bagging_smalldata.csv", index=False)
# Last expression of the cell, so the preview is actually rendered.
result.head(10)