# Classification_Test.ipynb

### This notebook contains the feature generation, supervised machine learning and validation for automatic classification of Tweets.

Author: Erik Puijk <br>
Date  : March 28, 2022

In [1]:
pip install -U scikit-learn

Requirement already up-to-date: scikit-learn in /home/erik/anaconda3/lib/python3.8/site-packages (1.0.2)
Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
import json, csv, math, random
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report
from sklearn import svm
import pyperclip as pc

In [2]:
# Module-level label encoder shared across cells; test_configurations uses it to convert category
# labels to integer codes and to map those codes back to the category names for reporting.
Encoder = LabelEncoder()

In [3]:
def read_tweets(path):
    """ Read the Tweets from a given text file and return the parsed JSON content.

    Parameters:
        path: location of a text file that holds a JSON document.

    Returns:
        The decoded JSON value, or an empty string when the file could not be opened or read.

    Only I/O errors are handled here; a file that is not valid JSON still raises the decoder's
    exception.
    """
    
    # Fallback value that is returned when the file cannot be read
    content = ""
    
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the platform's locale encoding
        with open(path, 'r', encoding='utf-8') as f:
            content = json.load(f)
    except IOError:
        print("I/O error")

    return content

In [4]:
def write_tweets(tweets_w, path):
    """ Serialise the given Tweets as JSON into a text file, replacing any previous content.

    A failure to open or write the file is reported on standard output instead of being raised.
    """

    try:
        with open(path, 'w') as out_file:
            json.dump(tweets_w, out_file)
    except OSError:  # IOError is an alias of OSError in Python 3
        print("I/O error")

In [5]:
def stats_count(cat):
    """ Count the occurrences of each category in a list of categories and print them.

    Parameters:
        cat: iterable of (hashable) category labels.

    Prints the header 'Number of training data', then one '<label>: <count>' line per distinct label,
    then an empty line. Labels are listed sorted by their string form so the output is the same on
    every run; iterating a set of strings directly is subject to hash randomisation.
    """
    
    # Single pass over the labels instead of one full list.count() scan per distinct label
    counts = {}
    for label in cat:
        counts[label] = counts.get(label, 0) + 1
    
    print('Number of training data')
    
    for label in sorted(counts, key=str):
        print('%s: %s' % (label, counts[label]))
        
    print('')

In [6]:
def calc_avg_scores(scores, cats, k):
    """ Calculate an average score per label given the scores of several runs.

    Parameters:
        scores: either a list of lists (one inner list of per-label scores for each run) or a flat
                list of scores.
        cats:   the labels; only its length is used, as the number of scores per run.
        k:      the number of runs to average over (the first k inner lists are used).

    Returns:
        For a list of lists: a new list holding, per label, the mean of that label's scores over the
        k runs.
        For a flat list: the given list itself, unchanged.
        For an empty list: an empty list.
    """
    
    # Nothing to average; this also avoids indexing scores[0] on an empty list
    if len(scores) == 0:
        return []
    
    # If a list of average (f1) scores is given, just return that list
    if not isinstance(scores[0], list):
        return scores

    # If a list of lists of scores is given, calculate the average per label
    return [sum(scores[j][i] for j in range(k)) / k for i in range(len(cats))]

In [7]:
def ml(X_train, y_train, X_test, v_type, alg, ngram_min, ngram_max, min_df, max_df, max_features):
    """ Turn the training and test texts into feature vectors, train a support vector classifier on
        the training part and return its predictions for the test part.

        v_type 'count' selects a CountVectorizer, any other value a TfidfVectorizer (with sublinear tf).
        alg 'svc' selects svm.SVC with a linear kernel, any other value svm.LinearSVC.
        The remaining arguments are passed on to the vectorizer. """

    # Settings that both vectorizers have in common
    shared_settings = {
        'ngram_range': (ngram_min, ngram_max),
        'min_df': min_df,
        'max_df': max_df,
        'max_features': max_features,
    }

    if v_type == 'count':
        vectorizer = CountVectorizer(analyzer='word', **shared_settings)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, **shared_settings)

    # The vectorizer is fitted on the training texts only and then applied to the test texts
    train_vectors = vectorizer.fit_transform(X_train)
    test_vectors = vectorizer.transform(X_test)

    if alg == 'svc':
        classifier = svm.SVC(kernel='linear', C=0.6, class_weight='balanced')
    else:
        classifier = svm.LinearSVC(C=0.5, class_weight='balanced', random_state=0)

    classifier.fit(train_vectors, y_train)

    return classifier.predict(test_vectors)

In [34]:
def test_configurations(v_type, tweets, labels, alg, n_runs=1):
    """ Train and evaluate a model on random train/test splits and report the f1 score per class.

    Parameters:
        v_type: vectorizer selector that is passed on to ml().
        tweets: list of Tweet texts.
        labels: list with the category label of each Tweet, aligned with tweets.
        alg:    classifier selector that is passed on to ml().
        n_runs: number of train/test splits to evaluate (default 1); run i uses random_state i + 5.

    For every run the label counts of the test set and a classification report are printed.
    Afterwards the f1 score of each class, averaged over the runs, is printed as '<class>: <score>'.

    Raises:
        ValueError: if n_runs is smaller than 1.
    """
    
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")
    
    cats = list(set(labels))
    scores = []
    
    # Define model parameters according to which categorization: a three-class categorization uses
    # 3000 features, any other number of classes uses 450
    params = {
        "ngram_min": 1,
        "ngram_max": 2,
        "min_df": 1,
        "max_df": 0.4,
        "max_features": 3000 if len(cats) == 3 else 450
    }
    
    # Fit the encoder once on all labels. Refitting it on the test labels of a split would shift the
    # integer codes whenever a class is missing from that split, so that truth and prediction would
    # no longer use the same encoding.
    Encoder.fit(labels)
    class_names = list(Encoder.classes_)
    class_ids = list(range(len(class_names)))
    
    for i in range(0, n_runs):
        # Split data set into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(tweets, labels, test_size=0.3, random_state=i+5)

        stats_count(list(y_test))

        # Encode labels
        y_train = Encoder.transform(y_train)
        y_test = Encoder.transform(y_test)

        pred = ml(X_train, y_train, X_test, v_type, alg, params['ngram_min'], params['ngram_max'],
                  params['min_df'], params['max_df'], params['max_features'])
        
        # Passing the full list of class ids keeps one score per class in every run, also when a class
        # occurs neither in the test labels nor in the predictions
        scores.append(list(f1_score(y_test, pred, labels=class_ids, average=None, zero_division=0)))

        print(classification_report(y_test, pred, labels=class_ids, target_names=class_names,
                                    zero_division=0))

    avg_scores = calc_avg_scores(scores, cats, n_runs)
    
    # Report every class under its own name instead of assuming one fixed set of three categories
    for name, score in zip(class_names, avg_scores):
        print(str(name) + ": " + str(round(score, 2)))

In [35]:
# Both categorizations are read from the same preprocessed file
corpus_path = 'source/tweets_all_preprocessed_exc_stopwords.txt'

# First categorization: keep the gold-standard (labeled) Tweets, then take their text and label
tweets_con = read_tweets(corpus_path)
tweets_con_gs = list(filter(lambda tweet: tweet['memo'] == 'gold_standard', tweets_con))
tweets_con_text = [tweet['text'] for tweet in tweets_con_gs]
labels_con = [tweet['cat_con'] for tweet in tweets_con_gs]

# Second categorization: same selection, with the 'cat_act' label
tweets_act = read_tweets(corpus_path)
tweets_act_gs = list(filter(lambda tweet: tweet['memo'] == 'gold_standard', tweets_act))
tweets_act_text = [tweet['text'] for tweet in tweets_act_gs]
labels_act = [tweet['cat_act'] for tweet in tweets_act_gs]

# Evaluate the first categorization; the alternative run on the second one is kept for reference
test_configurations('count', tweets_con_text, labels_con, 'linear_svc')
#test_configurations('tfidf', tweets_act_text, labels_act, 'svc')

Number of training data
SOC: 11
CAM: 33
POL: 96

              precision    recall  f1-score   support

         CAM       0.74      0.61      0.67        33
         POL       0.84      0.84      0.84        96
         SOC       0.41      0.64      0.50        11

    accuracy                           0.77       140
   macro avg       0.67      0.70      0.67       140
weighted avg       0.79      0.77      0.78       140

CAM: 0.67
POL: 0.84
SOC: 0.5
