In [None]:
import data
import random
import features
import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from scipy import stats
import pandas as pd
import numpy as np
from utils import go_to_project_root

In [None]:
def train_time_augmentation(ids, tweets_per_author=100):
    """Build augmented training authors by redistributing tweets within each class.

    The raw authors for ``ids`` are loaded, then, separately for truth label 0
    and truth label 1, the tweets of all authors with that label are pooled,
    shuffled, and dealt back out so that each author receives a fresh random
    mix of tweets written by authors of the same class. Every author is then
    renamed ``shuffled-<n>`` (1-based, in the order returned by
    ``data.get_raw_data``).

    Args:
        ids: Author ids forwarded to ``data.get_raw_data``.
        tweets_per_author: Number of tweets handed to each author. Defaults to
            100, the value that used to be hard-coded. If a class pool holds
            fewer tweets than its authors need, the last authors of that class
            receive fewer (possibly zero) tweets instead of raising IndexError.

    Returns:
        dict mapping each new ``shuffled-<n>`` id to its (mutated) author
        object. Keys match ``author_id`` so that merging the result into a
        dict of real authors adds entries instead of overwriting them.
        Authors whose truth is neither 0 nor 1 keep their tweets but are
        still renamed.
    """
    authors = data.get_raw_data(ids)

    # Label 0 is pooled and shuffled before label 1, which keeps the order of
    # calls into the `random` module stable for seeded runs.
    for label in (0, 1):
        group = [author for author in authors.values() if author.truth == label]

        pool = [tweet for author in group for tweet in author.tweets]
        random.shuffle(pool)

        # Deal consecutive slices of the shuffled pool; slicing avoids the
        # quadratic cost of repeatedly popping from the front of a list.
        for position, author in enumerate(group):
            start = position * tweets_per_author
            author.tweets = pool[start:start + tweets_per_author]

    # Rename the authors and key the result by the new id so the dict key and
    # the object's author_id never disagree.
    augmented = {}
    for number, author in enumerate(authors.values(), start=1):
        author.author_id = f"shuffled-{number}"
        augmented[author.author_id] = author

    return augmented

In [None]:
def test_time_augmentation(TestAuthor, n=3):
    # Get in an author object
    # Return a dict of author objects created from a subset of tweets
    Sub_Authors = {}
    for i in range(n):
        # Randomly shuffle the tweets of the author
        randomshuffle = random.shuffle(TestAuthor.tweets)
        
        #Save a new author object with half of the tweets
        Sub_Author[i] = Author(TestAuthor.author_id, randomshuffle[:50], TestAuthor.truth)
        
    return Sub_Authors

In [None]:
# Load the author IDs (first column) and their truth values (second column).
df = pd.read_csv("IDs_names.csv")
# DataFrames need .iloc for positional selection. Converting to NumPy arrays
# lets the integer fold indices from StratifiedKFold select rows directly.
X = df.iloc[:, 0].to_numpy()
y = df.iloc[:, 1].to_numpy()

PIPELINE_PATH = "data/processed/"
SEED = 69

# The augmentation helpers draw from the `random` module; seed it so a fresh
# kernel run reproduces the same augmented data.
random.seed(SEED)

### Get the JSON files of the already extracted authors. Save it somewhere

kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)

# enumerate keeps track of which fold we are in (1-based).
for k, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print("Beginning k fold {}".format(k))

    ############################ TRAINING ##############################

    # Already-processed authors for this fold's training ids.
    Train_Authors = data.get_processed_data(X[train_index])

    print("Augmenting training data.")

    # Augment training data. Then extract the features for it
    augmentations = train_time_augmentation(X[train_index])

    # First extract the nonlinguistic features
    augmentations = features.extract_nonlinguistic_features(augmentations)

    # Extract semantic similarity
    augmentations = features.extract_semantic_similarity(augmentations)

    # Get the lemmas
    augmentations = features.extract_clean_tweets(augmentations)

    # Lexical features -- TTR requires lemmas
    augmentations = features.extract_lexical_features(augmentations)

    # Get Named Entities
    augmentations = features.extract_named_entities(augmentations)

    # Get POS tags
    augmentations = features.extract_pos_tags(augmentations)

    # Count POSes and get adjectives
    augmentations = features.extract_POS_features(augmentations)

    # MORE FEATURES?

    # Merge the augmented authors into the real training authors.
    Train_Authors.update(augmentations)

    # Cluster the Named Entities
    Train_Authors, ner_clusters = features.extract_mcts_ner(Train_Authors)

    # Cluster the adjectives
    Train_Authors, adj_clusters = features.extract_mcts_adj(Train_Authors)

    # Flatten the authors into a numeric feature matrix.
    train_df = preprocessing.convert_to_df(Train_Authors)
    train_df = train_df.drop('author_id', axis=1).to_numpy()
    X_train = train_df[:, :-1]
    # Take the labels from the same matrix so they line up with the augmented
    # rows; y[train_index] only covers the un-augmented authors.
    # NOTE(review): assumes the last column is the truth label, exactly as the
    # test section below does -- confirm against preprocessing.convert_to_df.
    y_train = train_df[:, -1]

    go_to_project_root()

    # Persist every training author as JSON under PIPELINE_PATH.
    # NOTE(review): the original referenced undefined `path` and `author`
    # here; PIPELINE_PATH and the fold's training authors are assumed to be
    # what was meant. `convert_to_JSON` is not imported in this notebook and
    # still has to be brought into scope.
    for author in Train_Authors.values():
        with open(f"{PIPELINE_PATH}{author.author_id}.json", "w") as file:
            file.writelines(convert_to_JSON(author))

    ### SAVE OUR X_TRAIN, Y_TRAIN, & CLUSTERS LISTS

    ############################ TESTING ##############################
    y_test = y[test_index]
    preds = []

    # First get all of the test authors
    test_ids = X[test_index]
    Test_Authors = data.get_raw_data(test_ids)

    # Now to augment the test data

    # Go through each test author in the same order as y_test. The dict is
    # looked up by author id, not by the positional fold index.
    for author_id in test_ids:
        Test3s_Authors = test_time_augmentation(Test_Authors[author_id])

        # We now have three datapoints for every author datapoint.

        # Extract test features

        # First extract the nonlinguistic features
        Test3s_Authors = features.extract_nonlinguistic_features(Test3s_Authors)

        # Extract semantic similarity
        Test3s_Authors = features.extract_semantic_similarity(Test3s_Authors)

        # Get the lemmas
        Test3s_Authors = features.extract_clean_tweets(Test3s_Authors)

        # Lexical features -- TTR requires lemmas
        Test3s_Authors = features.extract_lexical_features(Test3s_Authors)

        # Get Named Entities
        Test3s_Authors = features.extract_named_entities(Test3s_Authors)

        # Cluster the Named Entities. The function returns (authors, clusters)
        # as in the training section, so unpack it.
        # TODO: replace with a function that assigns to the trained
        # ner_clusters instead of re-clustering on the test subsets.
        Test3s_Authors, _ = features.extract_mcts_ner(Test3s_Authors)

        # Get POS tags
        Test3s_Authors = features.extract_pos_tags(Test3s_Authors)

        # Count POSes and get adjectives
        Test3s_Authors = features.extract_POS_features(Test3s_Authors)

        # Cluster the adjectives (same return shape as above).
        # TODO: compare to the trained adj_clusters with a new function.
        Test3s_Authors, _ = features.extract_mcts_adj(Test3s_Authors)

        # MORE FEATURES?

        test_df = preprocessing.convert_to_df(Test3s_Authors)
        test_df = test_df.drop('author_id', axis=1).to_numpy()
        X_test = test_df[:, :-1]
        y_t3st = test_df[:, -1]

        ##### SAVE OUR EXTRACTED DATAPOINTS HERE######

        # TODO: the classifier is still undecided. Fit it on X_train / y_train
        # and assign its predictions for X_test to `pr3d`; until then the line
        # below fails with NameError.

        # Reduce the augmented predictions to a single majority label.
        # stats.mode returns a result object whose `.mode` is a scalar or a
        # 1-element array depending on the SciPy version; ravel handles both.
        preds.append(np.ravel(stats.mode(pr3d).mode)[0])

    # Print the accuracy of the entire validation set
    print(accuracy_score(y_test, preds))