In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import sys
import re
import random
from matplotlib import pyplot as plt
import data
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

In [23]:
# English stop words from NLTK (not referenced again in the cells visible here)
stoplist = set(stopwords.words('english'))

# Processed authors; only .values() is used below, so presumably a dict of author objects
authors = data.get_processed_data()

# Per-author label and adjective-count mapping, both in the iteration order of `authors`
truths = [auth.truth for auth in authors.values()]
adjectives = [auth.adjectives for auth in authors.values()]

# One string per author in which every adjective is repeated `count` times
adj_str = []
for adj_counts in adjectives:
    repeated = "".join(" " + (word + " ") * times for word, times in adj_counts.items())
    adj_str.append(repeated.strip())

'many  rare  prosecutorial  full  relevant  former former former  legal  ethical  only  cheap  2nd  fair  public public  retired  obvious  inadmissible  real  wrongful  vulgarian  wrong  latina  worst  evil'

In [41]:
def _adjective_counts_to_text(adjectives):
    '''
    Turn each {adjective: count} mapping into one whitespace-separated string in which
    every adjective is repeated `count` times, so the term frequencies survive vectorisation.
    '''
    return [
        "".join(" " + (text + " ") * count for text, count in adj.items()).strip()
        for adj in adjectives
    ]


def _classic_kmeans_algorithm():
    '''
    Return the name the installed scikit-learn uses for classic EM-style k-means.
    The option was called "full" until it was renamed to "lloyd" in scikit-learn 1.1
    ("full" was removed in 1.3).
    '''
    # Local import: the notebook only imports sklearn submodules by name
    import sklearn

    version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
    if version and (int(version.group(1)), int(version.group(2))) >= (1, 1):
        return "lloyd"
    return "full"


def _vocabulary_terms(vectorizer):
    '''
    Return the fitted vocabulary as a list of plain strings, indexed by feature column.
    Uses get_feature_names_out() when available and falls back to the older
    get_feature_names() otherwise.
    '''
    if hasattr(vectorizer, "get_feature_names_out"):
        return [str(term) for term in vectorizer.get_feature_names_out()]
    return vectorizer.get_feature_names()


def _purest_cluster(test_clusters, test_truths, k):
    '''
    Return (cluster index, purity) of the most one-sided cluster on the test set.

    purity = |0.5 - share of truth==0 in the cluster| * (fraction of test items in the cluster) * 10,
    so it ranges from 0 to 5. The index is -1 when no cluster has a purity above 0.
    '''
    n_test = len(test_truths)
    max_purity = 0
    pure_cluster = -1
    for c in range(k):
        in_cluster = test_clusters == c
        class_a = np.count_nonzero(in_cluster & (test_truths == 0))
        class_b = np.count_nonzero(in_cluster & (test_truths == 1))

        class_sum = class_a + class_b
        if class_sum != 0:
            purity = abs(0.5 - class_a / class_sum) * (class_sum / n_test) * 10
        else:
            purity = 0
        if purity > max_purity:
            pure_cluster = c
            max_purity = purity
    return pure_cluster, max_purity


def extract_mcts_adj(authors, n_models=50, k=3, threshold=1.0, _max_iter=5000, _n_init=5, data_split=0.6667,
                     max_attempts=None, random_state=None):
    '''
    Extract the adjectives that characterise the most one-sided k-means cluster and score every author with them.
    Repeatedly fits k-means on tf-idf vectors of the training authors' adjectives, keeps the models whose purest
    cluster (measured on the test authors) reaches `threshold`, and intersects the top-100 centroid terms of those
    clusters across all kept models.

    Note: purity is measured with an absolute value, so the selected cluster is the one dominated by EITHER
    truth class (0 or 1), not necessarily the fake-news spreaders.

    Parameters:
        authors(Author dict):
            The authors to calculate the most common terms for.
            Each value must expose `truth`, `adjectives` (adjective -> count) and `clean` (list of strings).

        n_models(int):
            The number of accepted models to collect
            Default is 50. With n_models <= 0 nothing is trained, the final set is empty and every score is 0.

        k(int):
            The number of clusters for each model
            Default is 3

        threshold(float):
            The minimum purity (0 to 5, see _purest_cluster) a model's best cluster needs for the model to be kept.
            Default is 1.0

        _max_iter(int):
            The maximum number of iterations for each model
            Default is 5000

        _n_init(int):
            The number of initializations the k-means model will do. Returns the best one.
            Default is 5

        data_split(float):
            The fraction of authors (in iteration order) used for training; the rest is the test set.
            Default is 0.6667

        max_attempts(int or None):
            The maximum number of k-means fits before giving up with a RuntimeError.
            Default is None (retry until n_models models were accepted, which may never terminate
            if `threshold` is unreachable).

        random_state(int, numpy RandomState or None):
            Seed for a single random generator shared by all fits, which makes the run reproducible
            while still giving every model a different initialisation.
            Default is None (unseeded)

    Returns:
        (authors, final_set):
            The same `authors` dict, with `most_common_adj_score` set on every author, and the set of terms
            shared by all accepted models.

    Raises:
        RuntimeError:
            If `max_attempts` fits were made without collecting n_models accepted models.
    '''

    print("Loading data...")

    author_list = list(authors.values())
    # Get truth values
    truths = [auth.truth for auth in author_list]
    # Get ADJ values as one string per author
    tweets = _adjective_counts_to_text(auth.adjectives for auth in author_list)

    # The split of training vs. testing data
    split = int(len(tweets) * data_split)
    tweetsTrain = tweets[:split]
    tweetsTest = tweets[split:]

    print("Creating models...")

    # This will be the list of sets of top terms of the purest cluster of every accepted model
    list_of_sets = []

    if n_models > 0:
        # Everything here is identical for every model, so it is computed once
        vectorizer = TfidfVectorizer(stop_words='english')
        X_train = vectorizer.fit_transform(tweetsTrain)
        terms = _vocabulary_terms(vectorizer)
        X_test = vectorizer.transform(tweetsTest) if tweetsTest else None
        test_truths = np.asarray(truths[split:], dtype=float)
        algorithm = _classic_kmeans_algorithm()
        if random_state is None or isinstance(random_state, np.random.RandomState):
            rng = random_state
        else:
            rng = np.random.RandomState(random_state)
        attempts = 0

    while len(list_of_sets) < n_models:
        if max_attempts is not None and attempts >= max_attempts:
            raise RuntimeError(
                f"Only {len(list_of_sets)} of {n_models} models reached purity >= {threshold} "
                f"after {attempts} attempts"
            )
        attempts += 1

        model = KMeans(n_clusters=k, algorithm=algorithm, init='k-means++', max_iter=_max_iter,
                       n_init=_n_init, random_state=rng)
        model.fit(X_train)

        # Assign every test author to a cluster in one call
        if X_test is not None:
            test_clusters = model.predict(X_test)
        else:
            test_clusters = np.empty(0, dtype=int)

        # Calculate the cluster with the maximum purity
        pure_cluster, max_purity = _purest_cluster(test_clusters, test_truths, k)

        # Check if there is a cluster with very high purity
        if max_purity >= threshold:
            # Indices of the 100 heaviest centroid terms, heaviest first
            top_terms = model.cluster_centers_[pure_cluster].argsort()[::-1][:100]
            list_of_sets += [set(terms[ind] for ind in top_terms)]
            # Fires only when the count is an exact multiple of n_models / 4
            if len(list_of_sets) % (n_models / 4) == 0 and len(list_of_sets) > 0:
                print(f"{len(list_of_sets) / (n_models) * 100}% of requested models trained")

    # Calculate the final set as an intersection of all sets identified
    final_set = set.intersection(*list_of_sets) if list_of_sets else set()

    # Terms are matched with underscores replaced by spaces
    phrases = [re.sub("_", " ", term) for term in final_set]

    for author in authors.values():
        # Get cleaned values
        cleaned = " ".join(author.clean)

        # NOTE(review): str.count matches substrings, so e.g. 'red' is also counted inside 'hundred'
        author.most_common_adj_score = sum(cleaned.count(phrase) for phrase in phrases)

    return authors, final_set

In [42]:
# English stop words from NLTK; `stoplist` is not referenced by any cell visible here
stoplist = set(stopwords.words('english'))

# Load the processed authors from the project `data` module.
# Downstream cells call .values()/.keys() on the result, so presumably a dict of author objects - confirm in `data`.
authors = data.get_processed_data()

In [43]:
# Collect 100 accepted models, each with up to 9999 k-means iterations and 10 initialisations;
# k, threshold and data_split keep their defaults. The function has no upper bound on retries.
# `authors` is rebound to the dict returned by the function, `final_set` to the terms shared by all models.
authors, final_set = extract_mcts_adj(authors, n_models=100, _max_iter=9999, _n_init=10)

Loading data...
Creating models...
25.0% of requested models trained
50.0% of requested models trained
75.0% of requested models trained
100.0% of requested models trained


In [44]:
# Display the terms that appear in the selected cluster of every accepted model
final_set

{'american',
 'best',
 'big',
 'black',
 'dead',
 'democratic',
 'female',
 'free',
 'illegal',
 'little',
 'massive',
 'new',
 'old',
 'political',
 'red',
 'right',
 'true',
 'trump'}

In [49]:
# Pass every author to data.exportJSON (project helper; what it writes and where is not visible here)
for author in authors.values():
    data.exportJSON(author)

In [50]:
import preprocessing

In [52]:
# Convert the authors into a table and display it as the cell output.
# NOTE(review): the meaning of the second positional argument (True) is defined in
# preprocessing.convert_to_df, which is not visible here - confirm before changing it.
preprocessing.convert_to_df(authors, True)

Unnamed: 0,max_similar,min_similar,mean_similar,number_identical,mcts_ner,mcts_adj,url_max,url_mean,hashtag_max,hashtag_mean,...,PART,PRON,PROPN,PUNCT,SCONJ,SYM,VERB,X,TOKEN,truth
0,0.996152,0.554930,0.772765,0.0,28.0,41.0,0.0,0.00,0.0,0.00,...,0.28,0.52,2.52,0.73,0.28,0.03,2.17,0.00,13.60,1.0
1,0.999985,0.525862,0.827712,565.0,1.0,2.0,1.0,0.55,0.0,0.00,...,0.14,0.27,1.05,1.85,0.09,0.04,0.56,0.00,12.10,0.0
2,0.995010,0.432037,0.846995,2.0,3.0,72.0,3.0,2.00,4.0,0.74,...,0.27,0.09,4.21,2.43,0.01,0.01,1.00,0.00,13.38,1.0
3,0.942972,0.474534,0.784161,6.0,5.0,28.0,1.0,1.00,0.0,0.00,...,0.51,0.35,5.47,0.82,0.02,0.02,1.36,0.02,13.55,1.0
4,0.994065,0.459044,0.807674,0.0,7.0,69.0,2.0,1.30,1.0,0.02,...,0.37,0.28,2.20,1.48,0.19,0.00,1.74,0.00,14.32,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,0.997494,0.811975,0.910553,327.0,17.0,34.0,2.0,1.49,3.0,2.53,...,0.34,0.04,2.94,1.85,0.12,0.00,1.51,0.04,13.19,0.0
296,0.930048,0.592229,0.789775,72.0,0.0,18.0,1.0,1.00,2.0,0.06,...,0.33,0.37,2.03,1.27,0.11,0.03,1.23,0.00,12.65,1.0
297,0.995494,0.253093,0.821176,0.0,43.0,29.0,1.0,0.33,2.0,0.06,...,0.54,1.31,1.55,2.53,0.20,0.07,2.36,0.06,19.26,1.0
298,0.981666,0.806598,0.932583,447.0,9.0,5.0,2.0,1.87,3.0,2.56,...,0.18,0.07,4.11,0.76,0.02,0.02,1.15,0.00,9.01,0.0
