# task1. preprocessing tweets
### load datasets

In [21]:
import pandas as pd
import numpy as np

In [11]:
tweetsData = pd.read_csv("all_annotated.tsv", sep = "\t")


In [12]:
tweetsData

Unnamed: 0,Tweet ID,Country,Date,Tweet,Definitely English,Ambiguous,Definitely Not English,Code-Switched,Ambiguous due to Named Entities,Automatically Generated Tweets
0,434215992731136000,TR,2014-02-14,Bugün bulusmami lazimdiii,0,0,1,0,0,0
1,285903159434563584,TR,2013-01-01,Volkan konak adami tribe sokar yemin ederim :D,0,0,1,0,0,0
2,285948076496142336,NL,2013-01-01,Bed,1,0,0,0,0,0
3,285965965118824448,US,2013-01-01,I felt my first flash of violence at some fool...,1,0,0,0,0,0
4,286057979831275520,US,2013-01-01,Ladies drink and get in free till 10:30,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
10497,774941788247298050,TR,2016-09-11,"I'm at @PiazzaAvym in Canik, Samsun w/ @mertar...",1,0,0,0,0,1
10498,774951242422480897,PH,2016-09-11,"El Nido, fica ao norte de Palawan, uma das ilh...",0,0,1,0,0,0
10499,774960083721531392,ID,2016-09-11,Alhamdulillah 😊👨‍👩‍👧‍👧🎂🍦makasih mah pah #lovyu...,0,0,1,0,0,0
10500,775057244798849024,NG,2016-09-11,Eid-Mubarak @ Bauchi Fedral Lowcost https://t....,0,1,0,0,1,0


### Preprocessing for tweets 

In [13]:
remove_countries  = list(tweetsData['Country'].value_counts()[:20].to_dict().keys())

In [14]:
tweetsData = tweetsData[tweetsData['Country'].isin(remove_countries)]
tweetsData.reset_index(drop=True,inplace=True)

In [15]:
tweetsData['Country'].value_counts()

Country
US    2966
BR    1195
ID    1099
TR     624
JP     505
GB     481
MY     395
ES     340
AR     312
FR     265
PH     233
MX     229
TH     168
RU     164
CA     121
IT      95
CL      90
NL      83
ZA      67
CO      62
Name: count, dtype: int64

In [16]:
x = tweetsData['Tweet'].values
y = tweetsData['Country'].values

In [17]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import re

tt = TweetTokenizer()
stopwords = set(stopwords.words('english')) #note: stopwords are all in lowercase

def preprocess_data(data, labels):
    cleaned_tokens = []
    cleaned_labels = []
    
    for string, label in zip(data, labels):
        dic = {}
        tokens = tt.tokenize(string)    # step1. tokenize a tweet data
        for token in tokens:
            token = token.lower()       # step2. lowercase words
            
            # step3 & step4. save words including any English alphabets and not belonging to stopwords 
            if (re.search("[a-z]+",token) and not token in stopwords):   
                dic[token] = dic.get(token,0) + 1  
                
        # save preprocessed tokens with their label if any token exists
        if len(dic) > 0:
            cleaned_tokens.append(dic)
            cleaned_labels.append(label)
    
    return cleaned_tokens, cleaned_labels    

x_processed, y_processed = preprocess_data(x, y)

print("Number of preprocessed tweets =", len(x_processed))
print("Number of preprocessed labels =", len(y_processed))
print("\nSamples of preprocessed data:")
for i in range(10):
    print("Country =", y_processed[i], "\tTweet =", x_processed[i])

Number of preprocessed tweets = 9303
Number of preprocessed labels = 9303

Samples of preprocessed data:
Country = TR 	Tweet = {'bugün': 1, 'bulusmami': 1, 'lazimdiii': 1}
Country = TR 	Tweet = {'volkan': 1, 'konak': 1, 'adami': 1, 'tribe': 1, 'sokar': 1, 'yemin': 1, 'ederim': 1, ':d': 1}
Country = NL 	Tweet = {'bed': 1}
Country = US 	Tweet = {'felt': 1, 'first': 1, 'flash': 1, 'violence': 1, 'fool': 2, 'bumped': 1, 'pity': 1}
Country = US 	Tweet = {'ladies': 1, 'drink': 1, 'get': 1, 'free': 1, 'till': 1}
Country = NL 	Tweet = {'@melanynijholtxo': 1, 'ahhahahahah': 1, 'dm': 1}
Country = US 	Tweet = {'fuck': 1}
Country = GB 	Tweet = {'watching': 1, '#miranda': 1, 'bbc': 1, '@mermhart': 1, 'u': 1, 'r': 1, 'hilarious': 1}
Country = US 	Tweet = {'shopping': 1, "kohl's": 1, 'http://t.co/i8zkqht9': 1}
Country = MX 	Tweet = {'@mizzh_': 1, 'celos': 1, 'es': 1, 'tu': 1, 'segundo': 1, 'nombre': 1}


In [18]:
# test block
assert(len(x_processed) == len(y_processed))
assert(len(x_processed) > 800)

# Task2. text Classification

### split datasets

In [19]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

# initialise the objects
x_train, x_dev, x_test = None, None, None
y_train, y_dev, y_test = None, None, None

# split datasets into 70:15:15 with tha same distribution
x_train, x_temp, y_train, y_temp = train_test_split(x_processed, y_processed, test_size = 0.3, \
                                                    stratify = y_processed, random_state = 1)
x_dev, x_test, y_dev, y_test = train_test_split(x_temp, y_temp, test_size = 0.5, \
                                                stratify = y_temp, random_state = 1)

# vectorize tokens 
vectorizer = DictVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_dev = vectorizer.transform(x_dev)
x_test = vectorizer.transform(x_test)

#### Naive Bayes and Logistic Regression

In [27]:
x_train

<6512x23704 sparse matrix of type '<class 'numpy.float64'>'
	with 46917 stored elements in Compressed Sparse Row format>

In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# tune hyper-paramters for MultinomialNB 
best_acc_NB = 0    
best_clfNB = None
alphas = np.logspace(-3, 2, num = 6)  

print("===========================\nMultinomialNB")
for a in alphas:
    clfNB = MultinomialNB(alpha = a)
    clfNB.fit(x_train,y_train)
    prediction_NB = clfNB.predict(x_dev)
    acc_NB = accuracy_score(y_dev,prediction_NB)
    print("alpha", a, "w/ accuracy", acc_NB)
    if acc_NB >= best_acc_NB:
        best_acc_NB = acc_NB
        best_clfNB = clfNB
    
# tune hyper-paramters for LogisticRegression
best_acc_LR = 0      
best_clfLR = None
# solver = ['newton-cg', 'liblinear','lbfgs']   
# penalty = ['none', 'l1', 'l2']
C = [0.001, 0.01, 0.1, 0.5, 1, 5, 10, 50, 100]    

print("===========================\nLogisticRegression")
for c in C:
    clfLR = LogisticRegression(C = c)
    clfLR.fit(x_train,y_train)
    prediction_LR = clfLR.predict(x_dev)
    acc_LR = accuracy_score(y_dev,prediction_LR)
    print("C", c, "w/ accuracy", acc_LR)
    if acc_LR >= best_acc_LR:
        best_acc_LR = acc_LR
        best_clfLR = clfLR

# print the optimal hyperparameters
print("\n\nbest classifier of mutinomial naive bayes model : %s with accuracy (%.3f)" \
      % (best_clfNB,best_acc_NB))                
print("best classifier of logistic regression model : %s with accuracy (%.3f)" \
      % (best_clfLR, best_acc_LR))

MultinomialNB
alpha 0.001 w/ accuracy 0.6652329749103942
alpha 0.01 w/ accuracy 0.6551971326164875
alpha 0.1 w/ accuracy 0.6587813620071684
alpha 1.0 w/ accuracy 0.628673835125448
alpha 10.0 w/ accuracy 0.5068100358422939
alpha 100.0 w/ accuracy 0.374910394265233
LogisticRegression
C 0.001 w/ accuracy 0.31684587813620074
C 0.01 w/ accuracy 0.38853046594982077
C 0.1 w/ accuracy 0.5189964157706093
C 0.5 w/ accuracy 0.5870967741935483


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


C 1 w/ accuracy 0.5978494623655914


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


C 5 w/ accuracy 0.607168458781362


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


C 10 w/ accuracy 0.603584229390681


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


C 50 w/ accuracy 0.603584229390681
C 100 w/ accuracy 0.6014336917562724


best classifier of mutinomial naive bayes model : MultinomialNB(alpha=0.001) with accuracy (0.665)
best classifier of logistic regression model : LogisticRegression(C=5) with accuracy (0.607)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### performance evaluation with the best parameters

In [23]:
# import library
from sklearn.metrics import f1_score, classification_report

# fit the training data into the best Naive Bayes classifier and predict the label of test data
best_clfNB.fit(x_train,y_train)          
nb_prediction = best_clfNB.predict(x_test)

print("\n>> MultinomialNB Model Results <<\n")
print("Accuracy :", round(accuracy_score(y_test,nb_prediction), 3))
print("Macro Avg. F-score :", round(f1_score(y_test,nb_prediction, average='macro'), 3))
print(classification_report(y_test, nb_prediction))


# fit the training data into the best Logistic Regression classifier and predict the label of test data
best_clfLR.fit(x_train,y_train)
lr_prediction = best_clfLR.predict(x_test)
       
print("Accuracy :", round(accuracy_score(y_test,lr_prediction), 3))
print("Macro Avg. F-score :", round(f1_score(y_test,lr_prediction, average='macro'),3))
print(classification_report(y_test, lr_prediction))



>> MultinomialNB Model Results <<

Accuracy : 0.65
Macro Avg. F-score : 0.437
              precision    recall  f1-score   support

          AR       0.41      0.49      0.45        47
          BR       0.88      0.84      0.86       178
          CA       0.00      0.00      0.00        18
          CL       0.33      0.23      0.27        13
          CO       0.00      0.00      0.00         9
          ES       0.31      0.25      0.28        51
          FR       0.92      0.56      0.70        39
          GB       0.28      0.18      0.22        72
          ID       0.83      0.74      0.78       165
          IT       0.83      0.36      0.50        14
          JP       0.61      0.44      0.51        64
          MX       0.36      0.29      0.32        34
          MY       0.70      0.47      0.57        59
          NL       0.67      0.50      0.57        12
          PH       0.61      0.49      0.54        35
          RU       0.25      0.11      0.15        18
  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'\nAccording to the result, LR model performs better with test dataset\n'

<1x23704 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>