In [1]:
from google.colab import drive
import pandas as pd
import numpy as np
import sys
import seaborn as sns
import os
import matplotlib.pyplot as plt
import random
# Seed Python's built-in RNG (this does not seed numpy or pandas).
random.seed(0)

# https://mjeensung.github.io/characterbigramtfidf/

# Load data
# Build the data directory from the absolute mount point so the path does not
# depend on the notebook's current working directory. The earlier relative form
# ('../content/gdrive/...') only resolved to the mounted drive when the working
# directory was a direct child of '/' (e.g. '/content').
mount_point = '/content/gdrive'
drive.mount(mount_point, force_remount=True)
data_dir = os.path.join(mount_point, 'MyDrive', '544', 'data')

# The CSV is read with ISO-8859-1 encoding; `df` holds the full dataset.
data_path = os.path.join(data_dir, "preprocessed_data.csv")
df = pd.read_csv(data_path, encoding="ISO-8859-1")

# TODO: Is taking REF's in the corpus a problem?

Mounted at /content/gdrive


In [2]:
# Select Users
# NOTE: this cell reassigns `df`, so it is not idempotent. Re-run the load cell
# before re-running this one; otherwise the user ranking is computed on the
# already-filtered frame and `df_other` comes out empty.
n_dataset_users = 50  # most active users kept as the main dataset
n_other_users = 25    # next most active users kept aside in `df_other`

# Rank users by number of tweets, most active first.
tweets_per_user = df.groupby('user')["text"].count().sort_values(ascending=False)
other_users = tweets_per_user.iloc[n_dataset_users:n_dataset_users + n_other_users].index.values
dataset_users = tweets_per_user.iloc[:n_dataset_users].index.values

print(f"Dataset size: {len(df)}")
df_other = df.loc[df.user.isin(other_users)]
df = df.loc[df.user.isin(dataset_users)]
print(f"Dataset size with selected users: {len(df)}")
print(f"Dataset size with other users: {len(df_other)}")

# Shuffle df (fixed random_state keeps the split reproducible)
df = df.sample(len(df), random_state=42)

# Train val test split
# Per-user split sizes; the boundaries below are derived from them so the three
# masks cannot drift out of sync.
n_train = 75
n_val = 35
n_test = 20
val_start = n_train
test_start = val_start + n_val  # 110
test_end = test_start + n_test  # 130

# Position of each tweet within its user's tweets, in the shuffled order.
tweet_i_user = df.groupby("user").cumcount()

# Tweets beyond the first `test_end` of a user keep set_name = None and belong
# to no split.
df["set_name"] = None
df.loc[tweet_i_user < val_start, "set_name"] = "train"
df.loc[(tweet_i_user >= val_start) & (tweet_i_user < test_start), "set_name"] = "val"
df.loc[(tweet_i_user >= test_start) & (tweet_i_user < test_end), "set_name"] = "test"
df = df.reset_index(drop=True)

Dataset size: 1451758
Dataset size with selected users: 8641
Dataset size with other users: 3157


In [3]:
# Spot-check one user's tweets together with their assigned split.
df[df["user"] == "Dutchrudder"]

Unnamed: 0,target,ids,date,flag,user,text,n_words,set_name
128,4,2055400325,Sat Jun 06 09:14:49 PDT 2009,NO_QUERY,Dutchrudder,REF LOL Christ she's too loud you could never...,21,train
142,4,1573785723,Tue Apr 21 02:51:01 PDT 2009,NO_QUERY,Dutchrudder,"REF LOL I know dude,, she's NUM on NUMth of ma...",20,train
155,0,1824688519,Sun May 17 02:29:25 PDT 2009,NO_QUERY,Dutchrudder,REF I diddent start it!!! It wasent me !!! Ju...,18,train
386,4,2003188454,Tue Jun 02 06:03:25 PDT 2009,NO_QUERY,Dutchrudder,REF what???? The sun gives u a funny feeling i...,10,train
394,0,1882470273,Fri May 22 06:55:55 PDT 2009,NO_QUERY,Dutchrudder,"Just rang my mum,, she's good but this new che...",24,train
...,...,...,...,...,...,...,...,...
8369,4,1556667529,Sat Apr 18 23:47:42 PDT 2009,NO_QUERY,Dutchrudder,"REF the most fun i'v had on NUM wheels, handle...",16,
8444,4,1881371605,Fri May 22 04:28:06 PDT 2009,NO_QUERY,Dutchrudder,REF I think she would!!!! Supernannys a kinky ...,7,
8484,4,1881385208,Fri May 22 04:30:31 PDT 2009,NO_QUERY,Dutchrudder,TAG REF she sprays up Walls like a sexy cat,8,
8523,4,1990434293,Mon Jun 01 04:51:41 PDT 2009,NO_QUERY,Dutchrudder,"REF LOL we still ere dude,, just been NUM busy...",11,


In [4]:
# Create dataset arrays
from sklearn.feature_extraction.text import TfidfVectorizer

# Texts and labels per split. Rows whose set_name is None belong to no split
# and are not used below.
train_text = df.loc[df.set_name == "train", "text"]
val_text = df.loc[df.set_name == "val", "text"]
test_text = df.loc[df.set_name == "test", "text"]
other_text = df_other["text"]

y_train = df.loc[df.set_name == "train", "user"]
y_val = df.loc[df.set_name == "val", "user"]
y_test = df.loc[df.set_name == "test", "user"]

# Fit the vectorizers on the TRAINING texts only. Fitting on the whole frame
# would let validation/test (and unassigned) tweets influence the vocabulary
# and the IDF weights, i.e. leak held-out data into the features.
# TODO: test other max_features values
char_vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(4, 4), max_features=5000).fit(train_text)
word_vectorizer = TfidfVectorizer(analyzer="word", ngram_range=(1, 1), max_features=2500).fit(train_text)

# Character 4-gram features (dense arrays).
X_train_char = char_vectorizer.transform(train_text).toarray()
X_val_char = char_vectorizer.transform(val_text).toarray()
X_test_char = char_vectorizer.transform(test_text).toarray()
X_other_char = char_vectorizer.transform(other_text).toarray()

# Word unigram features (dense arrays).
X_train_word = word_vectorizer.transform(train_text).toarray()
X_val_word = word_vectorizer.transform(val_text).toarray()
X_test_word = word_vectorizer.transform(test_text).toarray()
X_other_word = word_vectorizer.transform(other_text).toarray()

# Final feature matrices: character columns first, then word columns.
X_train = np.concatenate((X_train_char, X_train_word), axis=1)
X_val = np.concatenate((X_val_char, X_val_word), axis=1)
X_test = np.concatenate((X_test_char, X_test_word), axis=1)
X_other = np.concatenate((X_other_char, X_other_word), axis=1)

# Validate before anything is written, so inconsistent arrays never reach disk.
assert len(X_train) == len(y_train)
assert len(X_val) == len(y_val)
assert len(X_test) == len(y_test)
assert X_train.shape[1] == X_val.shape[1] == X_test.shape[1] == X_other.shape[1]

# Save dataset
for array_name, features in (
    ("X_train", X_train),
    ("X_val", X_val),
    ("X_test", X_test),
    ("X_other", X_other),
):
    with open(os.path.join(data_dir, f"{array_name}.npy"), 'wb') as f:
        np.save(f, features)

y_train.to_frame().to_csv(os.path.join(data_dir, "y_train.csv"), index=False)
y_val.to_frame().to_csv(os.path.join(data_dir, "y_val.csv"), index=False)
y_test.to_frame().to_csv(os.path.join(data_dir, "y_test.csv"), index=False)

In [5]:
# Sanity check: (number of training rows, char columns + word columns).
np.shape(X_train)

(3750, 7500)