In [1]:
import numpy as np
import pandas as pd
import json
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
import xgboost as xgb

In [2]:
with open('/Users/nissani/Desktop/Hateful_Memes_Project/data/train.json', 'r') as f:
    train_data = json.load(f)
    
with open('/Users/nissani/Desktop/Hateful_Memes_Project/data/dev.json', 'r') as f:
    dev_data = json.load(f)

In [3]:
columns = []
for el in list(train_data['42953'].keys())[6:]:
    if el != 'emotion_feature':
        columns.append(el)

In [4]:
columns.remove('meme_feature_vector')
columns.remove('tag_feature_vector')
columns.remove('caption_feature_vector')

In [5]:
columns

['meme_hate_speech',
 'meme_offensive_language',
 'meme_neither',
 'caption_hate_speech',
 'caption_offensive_language',
 'caption_neither',
 'tag_hate_speech',
 'tag_offensive_language',
 'tag_neither',
 'meme_sentiment',
 'caption_sentiment',
 'tag_sentiment',
 'protected_meme_scores',
 'protected_caption_scores',
 'all_text_freq',
 'train_text_freq',
 'dev_text_freq',
 'test_text_freq',
 'tags_race',
 'tags_disability',
 'tags_religion',
 'tags_sexual_orientation',
 'tags_violence',
 'tags_criminals',
 'tags_gender',
 'tags_animals',
 'text_race',
 'text_disability',
 'text_religion',
 'text_sexual_orientation',
 'text_violence',
 'text_criminals',
 'text_gender',
 'text_animals']

In [6]:
'''

End goal: array of labels and matrix of data

'''

'\n\nEnd goal: array of labels and matrix of data\n\n'

In [7]:
train_data['42953']['tag_sentiment']

['NEGATIVE', 0.9568]

In [8]:
def sentiment_check(data):
    sentiment_list = data
    sentiment_scores = {}
    if sentiment_list[0] == 'NEGATIVE':
        sentiment_scores['negative'] = sentiment_list[1]
        sentiment_scores['positive'] = 1 - sentiment_list[1]
    elif sentiment_list[0] == 'POSITIVE':
        sentiment_scores['positive'] = sentiment_list[1]
        sentiment_scores['negative'] = 1 - sentiment_list[1]
    else:
        raise Exception("sentiment label not found")
    
    return sentiment_scores

In [9]:
train_data['42953']['protected_meme_scores']

[567.9492130065482,
 546.2337104679173,
 588.6687470631587,
 571.2920274632197,
 555.7372433059063,
 553.8820845654845,
 555.1955790296342,
 588.4936292130878,
 584.0770774830744]

In [10]:
def format_data(data, keys1, keys2):
    labels = []
    feature_vectors = []
    for key1 in keys1:
        labels.append(data[key1]['label'])
        feature_vector = []
        for key2 in keys2:
            if isinstance(data[key1][key2], list):
                if 'sentiment' in key2:
                    sentiment_scores = sentiment_check(data[key1][key2])
                    feature_vector.append(sentiment_scores['negative'])
                    feature_vector.append(sentiment_scores['positive'])
                else:
                    for el in data[key1][key2]:
                        feature_vector.append(el)
            else:
                feature_vector.append(data[key1][key2])
        feature_vectors.append(feature_vector)
        
    return labels, feature_vectors

In [11]:
train_labels, train_feature_vectors = format_data(train_data, list(train_data.keys()), columns)
dev_labels, dev_feature_vectors = format_data(dev_data, list(dev_data.keys()), columns)

In [13]:
len(train_feature_vectors[0])

53

In [27]:
lr = LogisticRegression()
lr.fit(dev_feature_vectors, dev_labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [28]:
lr.score(train_feature_vectors, train_labels)

0.4536470588235294

In [29]:
lr.score(dev_feature_vectors, dev_labels)

0.7

In [18]:
knn = KNN()

In [19]:
knn.fit(train_feature_vectors, train_labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [20]:
knn.score(dev_feature_vectors, dev_labels)

0.498

In [21]:
model = xgb.XGBClassifier()

In [22]:
model.fit(np.array(train_feature_vectors), np.array(train_labels))

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [23]:
model.score(np.array(train_feature_vectors), np.array(train_labels))

0.7412941176470588

In [24]:
model.score(np.array(dev_feature_vectors), np.array(dev_labels))

0.53

In [25]:
def ablation_experiment(train_data, dev_data, keys2, model):
    remaining_keys = keys2.copy()
    while len(remaining_keys) > 0:
        scores = []
        print('getting scores')
        for name in remaining_keys:
            current_keys = remaining_keys.copy()
            current_keys.remove(name)
            print('making keys')
            train_labels, train_feature_vectors = format_data(train_data, list(train_data.keys()), current_keys)
            dev_labels, dev_feature_vectors = format_data(dev_data, list(dev_data.keys()), current_keys)
            print('formatted_data')
            model.fit(np.array(train_feature_vectors), np.array(train_labels))
            scores.append(model.score(np.array(dev_feature_vectors), np.array(dev_labels)))
            print(name)
            print(scores[-1])
        min_score = min(scores)
        print(min_score)
        argmin_score = np.argmin(scores)
        print(remaining_keys[argmin_score])
        remaining_keys.remove(remaining_keys[argmin_score])
    

In [26]:
ablation_experiment(train_data, dev_data, columns, model)

getting scores
making keys
formatted_data
meme_hate_speech
0.528
making keys
formatted_data
meme_offensive_language
0.526
making keys
formatted_data
meme_neither
0.526
making keys
formatted_data
caption_hate_speech
0.53
making keys
formatted_data
caption_offensive_language
0.53
making keys
formatted_data
caption_neither
0.528
making keys
formatted_data
tag_hate_speech
0.526
making keys
formatted_data
tag_offensive_language
0.524
making keys
formatted_data
tag_neither
0.528
making keys
formatted_data
meme_sentiment
0.518
making keys
formatted_data
caption_sentiment
0.528
making keys
formatted_data
tag_sentiment
0.53
making keys
formatted_data


KeyboardInterrupt: 

In [None]:
#remaining_keys=keys2.copy()
#while len(remaining_keys) > 0
#   worst_score=???
#.  associated_key=X
# for i in len(keys2):
#     current_keys=key2.copy()
#     cirrent_keys.remove(key2[i])
#      score=pickle_rick(remaining_keys)
#

In [None]:
#[1,2,3,a,v]
#[1,2,3,4,5]


#t=0
#[1,2,3]
#t=1
#[2,3,a]
#t=2
#