# MachineLearning.ipynb

### This notebook contains the feature generation, supervised machine learning and validation for automatic classification of Tweets.

Author: Erik Puijk <br>
Date  : March 28, 2022

In [198]:
pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[K     |████████████████████████████████| 26.7 MB 4.7 MB/s eta 0:00:01
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.23.2
    Uninstalling scikit-learn-0.23.2:
      Successfully uninstalled scikit-learn-0.23.2
Successfully installed scikit-learn-1.0.2
Note: you may need to restart the kernel to use updated packages.


In [267]:
import json
import csv
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from sklearn import metrics

In [268]:
# https://scikit-learn.org/stable/modules/svm.html
# https://machinelearningmastery.com/overfitting-and-underfitting-with-machine-learning-algorithms/
# https://vitalflux.com/hold-out-method-for-training-machine-learning-model/
# https://towardsdatascience.com/cross-validation-in-machine-learning-72924a69872f
# https://scikit-learn.org/stable/modules/cross_validation.html
# https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

In [269]:
def read_tweets(path):
    """ Read the Tweets from a given text file and return in JSON-format. """
    
    content = ""
    
    try:
        with open(path, 'r') as f:
            content = json.loads(f.read())
    except IOError:
        print("I/O error")
        
    print("Total Tweets read: %s\n" % (len(content)))

    return content

In [270]:
def write_tweets(tweets_w, path):
    """ Write obtained Tweets to a text file in JSON-format. """
    
    try:
        with open(path, 'w') as f:
            json.dump(tweets_w, f)
    except IOError:
        print("I/O error")

In [271]:
def stats_count(cat):
    
    print('POL: %s | %s' % (round((cat.count('POL') / len(cat) * 100)), 67))
    print('CAM: %s | %s' % (round((cat.count('CAM') / len(cat) * 100)), 26))
    print('SOC: %s | %s' % (round((cat.count('SOC') / len(cat) * 100)), 7))
    print('')
    print('NONE: %s | %s' % (round((cat.count('NONE') / len(cat) * 100)), 63))
    print('SUP: %s | %s' % (round((cat.count('SUP') / len(cat) * 100)), 15))
    print('CON: %s | %s' % (round((cat.count('CON') / len(cat) * 100)), 3))
    print('FOL: %s | %s' % (round((cat.count('FOL') / len(cat) * 100)), 18))

In [278]:
def classify(tweets, cat):
    
    # Split data set into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(tweets, cat, test_size=0.3, random_state=0)
        
    # Encode labels
    Encoder = LabelEncoder()
    y_train = Encoder.fit_transform(y_train)
    y_test = Encoder.fit_transform(y_test)
    
    Tfidf_vect = TfidfVectorizer(max_features=3000)
    Tfidf_vect.fit(tweets)
    
    X_train = Tfidf_vect.transform(X_train)
    X_test = Tfidf_vect.transform(X_test)
    
    clf = svm.SVC(kernel='linear', C=1)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    
    scores = metrics.f1_score(y_test, pred, average='micro')
    #scores = clf.score(X_test, y_test)

    print(scores)
    
    #SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    #SVM.fit(X_train, y_train)
    #predictions_SVM = SVM.predict(X_test)
    #print(*list(zip(Encoder.inverse_transform(y_test), Encoder.inverse_transform(predictions_SVM))), sep='\n')
    #print("SVM Accuracy Score -> ", accuracy_score(predictions_SVM, y_test)*100)

In [279]:
tweets_r = read_tweets('source/tweets_all_preprocessed.txt')

tweets_tokens = [tweet_r['text'].split() for tweet_r in tweets_r if tweet_r['memo'] == 'gold_standard']
tweets = [tweet_r['text'] for tweet_r in tweets_r if tweet_r['memo'] == 'gold_standard']
cat_con = [tweet_r['cat_con'] for tweet_r in tweets_r if tweet_r['memo'] == 'gold_standard']
cat_act = [tweet_r['cat_act'] for tweet_r in tweets_r if tweet_r['memo'] == 'gold_standard']

classify(tweets, cat_con)
classify(tweets, cat_act)

Total Tweets read: 4664

0.8142857142857143
0.7285714285714285
