In [3]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.2.0-py3-none-macosx_10_13_x86_64.macosx_10_14_x86_64.macosx_10_15_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 1.1 MB/s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-1.2.0


In [7]:
import numpy as np
import pandas as pd
import json
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
import xgboost as xgb

In [6]:
with open('/Users/forrest.xiao/projects/Hateful_Memes_Project/data/train.json', 'r') as f:
    train_data = json.load(f)
    
with open('/Users/forrest.xiao/projects/Hateful_Memes_Project/data/dev.json', 'r') as f:
    dev_data = json.load(f)

In [8]:
columns = []
for el in list(train_data['42953'].keys())[6:]:
    if el != 'emotion_feature':
        columns.append(el)

In [11]:
# columns.remove('meme_feature_vector')
# columns.remove('tag_feature_vector')
# columns.remove('caption_feature_vector')

In [12]:
columns

['meme_hate_speech',
 'meme_offensive_language',
 'meme_neither',
 'caption_hate_speech',
 'caption_offensive_language',
 'caption_neither',
 'tag_hate_speech',
 'tag_offensive_language',
 'tag_neither',
 'meme_sentiment',
 'caption_sentiment',
 'tag_sentiment',
 'tag_feature_vector',
 'caption_feature_vector',
 'meme_feature_vector',
 'protected_meme_scores',
 'protected_caption_scores',
 'all_text_freq',
 'train_text_freq',
 'dev_text_freq',
 'test_text_freq',
 'tags_race',
 'tags_disability',
 'tags_religion',
 'tags_sexual_orientation',
 'tags_violence',
 'tags_criminals',
 'tags_gender',
 'tags_animals',
 'text_race',
 'text_disability',
 'text_religion',
 'text_sexual_orientation',
 'text_violence',
 'text_criminals',
 'text_gender',
 'text_animals']

In [13]:
'''

End goal: array of labels and matrix of data

'''

'\n\nEnd goal: array of labels and matrix of data\n\n'

In [14]:
train_data['42953']['tag_sentiment']

['NEGATIVE', 0.9568]

In [15]:
def sentiment_check(data):
    sentiment_list = data
    sentiment_scores = {}
    if sentiment_list[0] == 'NEGATIVE':
        sentiment_scores['negative'] = sentiment_list[1]
        sentiment_scores['positive'] = 1 - sentiment_list[1]
    elif sentiment_list[0] == 'POSITIVE':
        sentiment_scores['positive'] = sentiment_list[1]
        sentiment_scores['negative'] = 1 - sentiment_list[1]
    else:
        raise Exception("sentiment label not found")
    
    return sentiment_scores

In [16]:
train_data['42953']['protected_meme_scores']

[567.9492130065482,
 546.2337104679173,
 588.6687470631587,
 571.2920274632197,
 555.7372433059063,
 553.8820845654845,
 555.1955790296342,
 588.4936292130878,
 584.0770774830744]

In [17]:
def format_data(data, keys1, keys2):
    labels = []
    feature_vectors = []
    for key1 in keys1:
        labels.append(data[key1]['label'])
        feature_vector = []
        for key2 in keys2:
            if isinstance(data[key1][key2], list):
                if 'sentiment' in key2:
                    sentiment_scores = sentiment_check(data[key1][key2])
                    feature_vector.append(sentiment_scores['negative'])
                    feature_vector.append(sentiment_scores['positive'])
                else:
                    for el in data[key1][key2]:
                        feature_vector.append(el)
            else:
                feature_vector.append(data[key1][key2])
        feature_vectors.append(feature_vector)
        
    return labels, feature_vectors

In [21]:
columns=['tag_feature_vector',
 'caption_feature_vector',
 'meme_feature_vector']

train_labels, train_feature_vectors = format_data(train_data, list(train_data.keys()), columns)
dev_labels, dev_feature_vectors = format_data(dev_data, list(dev_data.keys()), columns)

In [22]:
len(train_feature_vectors[0])

1836

In [105]:
def test_models(train_feature_vectors, train_labels, dev_feature_vectors, dev_labels):
    lr = LogisticRegression()
    lr.fit(train_feature_vectors, train_labels)
    lr_train_score=lr.score(train_feature_vectors, train_labels)
    lr_dev_score=lr.score(dev_feature_vectors, dev_labels)
    
    print("LR Train: ", lr_train_score)
    print("LR Dev: ", lr_dev_score)
    
    knn = KNN()
    knn.fit(train_feature_vectors, train_labels)
    knn_train_score=knn.score(train_feature_vectors, train_labels)
    knn_dev_score=knn.score(dev_feature_vectors, dev_labels)
    print("KNN Train: ", knn_train_score)
    print("KNN Dev: ", knn_dev_score)    
    
    model = xgb.XGBClassifier()
    model.fit(np.array(train_feature_vectors), np.array(train_labels))
    xgb_train_score=model.score(np.array(train_feature_vectors), np.array(train_labels))
    xgb_dev_score=model.score(np.array(dev_feature_vectors), np.array(dev_labels))
    
    print("XGB Train: ", xgb_train_score)
    print("XGB Dev: ", xgb_dev_score)
    
    lr_pred=lr.predict(dev_feature_vectors)
    knn_pred=knn.predict(dev_feature_vectors)
    xgb_pred=model.predict(np.array(dev_feature_vectors))
    
    lr_pred_df=pd.DataFrame(lr_pred)
    print("LR Distribution: ", lr_pred_df[0].value_counts(normalize=True))
    
    knn_pred_df=pd.DataFrame(knn_pred)
    print("KNN Distribution: ", knn_pred_df[0].value_counts(normalize=True))    
    
    xgb_pred_df=pd.DataFrame(xgb_pred)
    print("XGB Distribution: ", xgb_pred_df[0].value_counts(normalize=True))    
    
    
    lr_knn=accuracy(lr_pred, knn_pred)
    print("LR/KNN Accuracy: ",lr_knn)
    
    xgb_knn=accuracy(xgb_pred, knn_pred)
    print("XGB/KNN Accuracy: ",xgb_knn)
        
    lr_xgb=accuracy(xgb_pred, lr_pred)
    print("LR/XGB Accuracy: ",lr_xgb)
    
    return lr, knn, xgb, lr_pred, knn_pred, xgb_pred

In [106]:
# Baseline models with raw embeddings
lr, knn, xgb, lr_pred, knn_pred, xgb_pred=test_models(train_feature_vectors, train_labels, dev_feature_vectors, dev_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LR Train:  0.7889411764705883
LR Dev:  0.56
KNN Train:  0.7385882352941177
KNN Dev:  0.564
XGB Train:  0.9978823529411764
XGB Dev:  0.546
LR Distribution:  0    0.724
1    0.276
Name: 0, dtype: float64
KNN Distribution:  0    0.688
1    0.312
Name: 0, dtype: float64
XGB Distribution:  0    0.858
1    0.142
Name: 0, dtype: float64
LR/KNN Accuracy:  0.644
XGB/KNN Accuracy:  0.702
LR/XGB Accuracy:  0.77


In [110]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

eclf1 = VotingClassifier(estimators=[('lr', lr), ('knn', knn)], voting='hard')
eclf1 = eclf1.fit(train_feature_vectors, train_labels)
ens_pred=eclf1.predict(train_feature_vectors)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [113]:
print("dev:",eclf1.score(dev_feature_vectors, dev_labels))

dev: 0.56


In [111]:
print(accuracy(ens_pred, train_labels))

0.7692941176470588


In [114]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

eclf2 = VotingClassifier(estimators=[('lr', lr), ('knn', knn)], voting='soft')
eclf2 = eclf2.fit(train_feature_vectors, train_labels)
ens_pred_2=eclf2.predict(train_feature_vectors)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [115]:
print("soft dev:",eclf2.score(dev_feature_vectors, dev_labels))

soft dev: 0.564


In [116]:
print(accuracy(ens_pred_2, train_labels))

0.810235294117647


In [117]:
new_columns= ['caption_feature_vector','meme_feature_vector']

train_labels, train_feature_dif = format_data(train_data, list(train_data.keys()), new_columns)
dev_labels, dev_feature_dif = format_data(dev_data, list(dev_data.keys()), new_columns)

vector1=np.array(train_feature_dif)[:,:768]
vector2=np.array(train_feature_dif)[:,768:]

train_feature_dif_vectors=vector1-vector2


vector1=np.array(dev_feature_dif)[:,:768]
vector2=np.array(dev_feature_dif)[:,768:]

dev_feature_vectors_dif=vector1-vector2
#train_feature_vectors[0])


In [118]:
lrd, knnd, xgbd, lr_predd, knn_predd, xgb_predd=test_models(train_feature_dif_vectors, train_labels, dev_feature_vectors_dif, dev_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LR Train:  0.7345882352941177
LR Dev:  0.538
KNN Train:  0.7325882352941177
KNN Dev:  0.526
XGB Train:  0.9977647058823529
XGB Dev:  0.556
LR Distribution:  0    0.746
1    0.254
Name: 0, dtype: float64
KNN Distribution:  0    0.838
1    0.162
Name: 0, dtype: float64
XGB Distribution:  0    0.804
1    0.196
Name: 0, dtype: float64
LR/KNN Accuracy:  0.708
XGB/KNN Accuracy:  0.782
LR/XGB Accuracy:  0.774


In [120]:
print("LR/LR Diff Accuracy: ", accuracy(lr_predd, lr_pred))
print("KNN/KNN Diff Accuracy: ", accuracy(knn_predd, knn_pred))
print("XKB/XKB Diff Accuracy: ", accuracy(xgb_predd, xgb_predd))

LR/LR Diff Accuracy:  0.83
KNN/KNN Diff Accuracy:  0.738
XKB/XKB Diff Accuracy:  1.0


In [100]:
# Baseline models with PCA'd features

from sklearn.decomposition import PCA
pca=PCA(n_components=500, whiten=True)
len(train_feature_vectors)

demeaned_train_feature_vectors=train_feature_vectors-np.array(train_feature_vectors).mean()
pca_train_vectors=pca.fit_transform(demeaned_train_feature_vectors, y=None)


demeaned_dev_feature_vectors=dev_feature_vectors-np.array(dev_feature_vectors).mean()
pca_dev_vectors=pca.fit_transform(demeaned_dev_feature_vectors, y=None)

In [101]:
lr_pred, knn_pred, xgb_pred=test_models(pca_train_vectors, train_labels, pca_dev_vectors, dev_labels)

LR Train:  0.7370588235294118
LR Dev:  0.504
KNN Train:  0.7109411764705882
KNN Dev:  0.476
XGB Train:  0.9978823529411764
XGB Dev:  0.502
LR Distribution:  0    0.724
1    0.276
Name: 0, dtype: float64
KNN Distribution:  1    0.536
0    0.464
Name: 0, dtype: float64
XGB Distribution:  0    0.806
1    0.194
Name: 0, dtype: float64
LR/KNN Accuracy:  0.532
XGB/KNN Accuracy:  0.518
LR/XGB Accuracy:  0.754
