In [None]:
import os
import sys
import json
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import ast
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
sys.path.insert(0, os.path.abspath("../../" + 'src/scraping'))
import metadata as meta
sys.path.insert(0, os.path.abspath("../../" + "src/modeling"))
import basic_stats as basic
import time
from PIL import Image

In [None]:
def get_all_thumbnails(save_dir, chan_ids, master_df):
    """Fetch thumbnails channel by channel, then attach image statistics to the metadata.

    Parameters
    ----------
    save_dir : str
        Directory handed to both ``meta.download_df_thumbs`` and
        ``basic.basic_image_stats``.
    chan_ids : iterable
        Channel ids; each one selects the rows of ``master_df`` whose
        ``channelId`` matches.
    master_df : pandas.DataFrame
        Metadata table; must contain ``channelId`` and ``videoId`` columns.

    Returns
    -------
    pandas.DataFrame
        ``master_df`` inner-merged with the image statistics on ``videoId``,
        keeping the first row for every ``videoId``. Note that the merge uses
        the whole of ``master_df``, not only the rows of the given channels.
    """
    for chan_id in chan_ids:
        chan_df = master_df[master_df['channelId'] == chan_id]
        # Called for its effect only (presumably writes the images into
        # save_dir - confirm in src/scraping/metadata); the return value was
        # never used, so it is no longer bound to a name.
        meta.download_df_thumbs(chan_df, save_dir, 'medium')
    chan_basic_df = basic.basic_image_stats(save_dir)
    full_chan = master_df.merge(chan_basic_df, on='videoId')
    return full_chan.drop_duplicates(subset='videoId')

def get_corr(img_feats, meta_feats,df,sign):
    """Correlate every image feature with every metadata feature.

    For each (metadata column, image column) pair, rows where either column
    is null are discarded and the off-diagonal entry of ``np.corrcoef`` is
    recorded.

    Parameters
    ----------
    img_feats : list of str
        Image feature column names.
    meta_feats : list of str
        Metadata column names.
    df : pandas.DataFrame
        Frame holding all of the columns above.
    sign : str
        Accepted for call compatibility; not used by the computation.

    Returns
    -------
    pandas.DataFrame
        Columns ``img_ft``, ``meta_ft`` and ``corr``, ordered by metadata
        column first and image column second.
    """
    def _pair_corr(meta_name, img_name):
        # restrict to rows where both columns are populated
        both_present = df[df[meta_name].notnull() & df[img_name].notnull()]
        return np.corrcoef(both_present[meta_name], both_present[img_name])[0, 1]

    records = [
        [img_name, meta_name, _pair_corr(meta_name, img_name)]
        for meta_name in meta_feats
        for img_name in img_feats
    ]
    return pd.DataFrame(records, columns=['img_ft','meta_ft','corr'])


# descriptive stats of one meta column for the videos matching each specified emotion
def get_emotions_df(emotions,in_df,meta_col):
    """Describe ``meta_col`` separately for the rows that mention each emotion.

    A row matches an emotion when its ``emotions`` cell is a string that
    contains the emotion name as a substring.

    Parameters
    ----------
    emotions : list of str
        Emotion names to look for.
    in_df : pandas.DataFrame
        Frame with an ``emotions`` column and the ``meta_col`` column.
    meta_col : str
        Column whose ``describe()`` summary is collected per emotion.

    Returns
    -------
    pandas.DataFrame
        One row per emotion holding the ``describe()`` statistics, an
        ``index`` column produced by ``reset_index`` and an ``emotion``
        column with the emotion name.
    """
    def _mentions(label):
        # Non-string cells are passed through unchanged; a NaN cell then
        # compares unequal to True and its row is left out.
        return lambda cell: label in cell if type(cell) == str else cell

    per_emotion = [
        in_df[in_df['emotions'].apply(_mentions(label)) == True][meta_col].describe()
        for label in emotions
    ]
    emotions_df = pd.DataFrame(per_emotion).reset_index()
    emotions_df['emotion'] = emotions
    return emotions_df
                    
def get_range(col_name, df):
    """Build the candidate cutoff values for one column.

    The values run from the column's 25th percentile up to (but excluding)
    its maximum, in steps of one third of its standard deviation.

    Parameters
    ----------
    col_name : str
        Column of ``df`` to summarise.
    df : pandas.DataFrame
        Frame containing ``col_name``.

    Returns
    -------
    numpy.ndarray
        The evenly spaced cutoffs produced by ``np.arange``.
    """
    summary = df[col_name].describe()
    lower, upper = summary['25%'], summary['max']
    step = summary['std'] / 3
    return np.arange(lower, upper, step)

def tune_params(df, c1, c2, meta_col):
    """Grid-search lower-bound cutoffs on two columns against a target column.

    Every pair of cutoffs from ``get_range(c1, df)`` x ``get_range(c2, df)``
    selects the rows with ``df[c1] > cutoff1`` and ``df[c2] > cutoff2``; the
    count, mean and median of ``meta_col`` over that selection are recorded.
    Only cutoff pairs that select at least one row and beat the whole-frame
    mean AND median of ``meta_col`` are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``c1``, ``c2`` and ``meta_col``.
    c1, c2 : str
        Names of the two columns to threshold.
    meta_col : str
        Name of the target column being summarised.

    Returns
    -------
    pandas.DataFrame
        Columns ``col1``, ``col1cutoff``, ``col2``, ``col2cutoff``, ``count``,
        ``mean`` and ``median``. It is an independent copy, so callers may add
        columns to it without writing onto a slice of an intermediate frame.
    """
    baseline_stats = df[meta_col].describe()
    base_mean = baseline_stats['mean']
    base_median = baseline_stats['50%']

    # Hoisted out of the grid: none of these change between cutoff pairs.
    c1_values = df[c1]
    c2_values = df[c2]
    target = df[meta_col]
    c1_range = get_range(c1, df)
    c2_range = get_range(c2, df)

    rel_stats = []
    for c1_cutoff in c1_range:
        above_c1 = c1_values > c1_cutoff
        for c2_cutoff in c2_range:
            # Column-wise comparison instead of a Python-level lambda per row;
            # NaN compares False on either side, exactly as in the row-wise form.
            selected = above_c1 & (c2_values > c2_cutoff)
            stats = target[selected].describe()
            rel_stats.append([c1, c1_cutoff, c2, c2_cutoff,
                              stats['count'], stats['mean'], stats['50%']])
    res_df = pd.DataFrame(
        rel_stats,
        columns=['col1','col1cutoff','col2','col2cutoff','count','mean','median'])

    # count > 0 gets rid of param combinations with no results; the other two
    # conditions filter out combinations that did worse than the baseline.
    # A boolean mask (rather than a row-wise apply) also behaves sensibly when
    # no combination has any rows.
    beats_baseline = (
        (res_df['count'] > 0)
        & (res_df['mean'] > base_mean)
        & (res_df['median'] > base_median)
    )
    return res_df[beats_baseline].copy()

In [None]:
# config variables
emotions = ['angry', 'happy', 'sad', 'fear', 'neutral', 'surprise']
describe_cols = ['mean','std','25%','50%','75%']
numerical_img_feats = ['unique_rgb_ratio','mean_hue',
                       'mean_saturation','mean_brightness', 'contrast',
                       'edge_score','numFaces']
meta_feats = ['viewCount','z_views']
master_df = pd.read_csv("../../data/local/fortnite/video_data/fortnite_master_metadata_updated_facial_features.csv")
save_dir = "../../data/local/fortnite/thumbnails/"
# Channel ids are taken from master_df itself. The previous source, `all_meta`,
# is never defined in this notebook (its only assignment is commented out), so
# this line raised NameError on a fresh kernel. get_all_thumbnails filters
# master_df by these ids, so they must come from the same table anyway.
chan_ids = master_df['channelId'].value_counts().index

In [None]:
# Downloads the thumbnails that are not yet in local storage and computes the
# basic image statistics for them. This step takes a while.
final_df = get_all_thumbnails(save_dir=save_dir, chan_ids=chan_ids, master_df=master_df)

In [None]:
# Subsets of the data used below.
# Only videos that have a z_views value are kept at all.
final_df = final_df.loc[final_df['z_views'].notna()]
# Videos with / without a numFaces value.
face_data = final_df.loc[final_df['numFaces'].notna()]
no_face_data = final_df.loc[final_df['numFaces'].isna()]
# Every video, with a missing numFaces replaced by 0.
all_face = final_df.fillna({"numFaces": 0})
# Dataset used for the EDA in the rest of the notebook.
analysis_df = all_face

In [None]:
# Bar charts per emotion: first the video count, then every descriptive
# statistic in describe_cols for every meta column.
plt.rcParams['figure.figsize'] = [6, 4]
bar_colors = ['red','green','blue','purple','grey','orange']

# Counts are read from the summary of the first meta column.
emotions_df = get_emotions_df(emotions,analysis_df,meta_feats[0])
fig, ax = plt.subplots()
ax.bar(emotions_df['emotion'], emotions_df['count'], color=bar_colors)
ax.set(xlabel='emotion', ylabel='count', title="Video count per emotion")
plt.show()

for meta_feat in meta_feats:
    emotions_df = get_emotions_df(emotions,analysis_df,meta_feat)
    for col in describe_cols:
        fig, ax = plt.subplots()
        ax.bar(emotions_df['emotion'], emotions_df[col], color=bar_colors)
        ax.set(xlabel='emotion', ylabel=col,
               title=meta_feat + " (" + col + ") per emotion")
        plt.show()

In [None]:
# Table of correlation coefficients (image feature vs. meta column), kept for
# convenience; the plotting and tuning cells below read from it.
all_corrs = get_corr(img_feats=numerical_img_feats,
                     meta_feats=meta_feats,
                     df=analysis_df,
                     sign='-')
all_corrs

In [None]:
# Scatter of each numerical image feature against each meta column, with the
# matching correlation coefficient shown in the title.
for col in numerical_img_feats:
    for meta_col in meta_feats:
        # look up the coefficient for this (image feature, meta column) pair
        pair_mask = (all_corrs['img_ft'] == col) & (all_corrs['meta_ft'] == meta_col)
        cur_corr = all_corrs.loc[pair_mask, 'corr'].iloc[0]

        plt.scatter(analysis_df[col], analysis_df[meta_col])
        plt.title(col + " vs. " + meta_col + " Corr: " + str(cur_corr.round(3)))
        plt.xlabel(col)
        plt.ylabel(meta_col)
        plt.show()

In [None]:
# For every meta column: pick the two image features with the highest
# correlation, grid-search cutoffs on them, and compare the videos above the
# averaged cutoffs ("top_cut") with all videos ("baseline").
# The per-column frames are collected in a list and concatenated once, instead
# of growing a DataFrame with pd.concat on every iteration.
summary_frames = []
for meta_feat in meta_feats:
    cur_corr_df = all_corrs[all_corrs['meta_ft'] == meta_feat]
    c1, c2 = cur_corr_df.sort_values(by='corr',ascending=False)['img_ft'].iloc[0:2].values
    # this takes a while bc of the many combinations, may re-write the way I get the ranges
    # .assign builds a new frame, so nothing is written onto the frame tune_params returned
    tuned_df = tune_params(analysis_df,c1,c2,meta_feat).assign(
        mm_sum=lambda d: d['mean'] + d['median'])
    #old cell break
    top_df = tuned_df.sort_values(by='mm_sum',ascending=False)
    # Despite their names, these are the mean cutoffs for c1 and c2, whichever
    # features were picked above.
    # NOTE(review): the mean is taken over every row of top_df, so the sort by
    # mm_sum does not influence the cutoffs - confirm whether a top-N was intended.
    rgb_cutoff = top_df['col1cutoff'].mean()
    sat_cutoff = top_df['col2cutoff'].mean()
    # column-wise comparison instead of a row-wise apply
    top_cut = analysis_df[(analysis_df[c1] > rgb_cutoff) & (analysis_df[c2] > sat_cutoff)]

    baseline_stats = analysis_df[meta_feat].describe().round(4)
    baseline_stats['type'] = 'baseline'
    baseline_stats['meta_col'] = baseline_stats.name

    top_cut_stats = top_cut[meta_feat].describe().round(4)
    top_cut_stats['type'] = 'top_cut'
    top_cut_stats['meta_col'] = top_cut_stats.name

    cur_summary_df = pd.DataFrame([baseline_stats,top_cut_stats]).reset_index(drop=True)
    summary_frames.append(cur_summary_df)

# pd.concat raises on an empty list, so fall back to an empty frame when
# meta_feats is empty.
summary_df = pd.concat(summary_frames) if summary_frames else pd.DataFrame()

In [None]:
# Threshold analysis: for each meta column, the 2 image features with the highest
# correlation to it are thresholded at cutoffs derived from tune_params.
# Each meta column contributes two rows: 'baseline' (describe() over all videos)
# and 'top_cut' (describe() over the videos above both cutoffs).
summary_df

In [None]:
# not adjusted for multiple meta feats
targ_col = 'z_views'
# One fixed seed for the split and for both ensembles, so the printed scores
# and the plots are the same on every Restart & Run All.
random_seed = 42
X_train, X_test, y_train, y_test = train_test_split(analysis_df[numerical_img_feats],
                                                    analysis_df[targ_col],
                                                    train_size=.9,
                                                    random_state=random_seed)


def _fit_score_plot(model, label):
    """Fit ``model`` on the training split, print its test score and plot predictions.

    ``label`` is the short model name used in the printed line and the plot
    title. Returns the predictions for ``X_test``.
    """
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(label + " Score:", model.score(X_test, y_test))
    plt.scatter(preds, y_test)
    plt.title(label + ' ' + targ_col + ' Predictions vs Actual')
    plt.xlabel('prediction')
    plt.ylabel('actual')
    plt.show()
    return preds


rf = RandomForestRegressor(random_state=random_seed)
rf_preds = _fit_score_plot(rf, 'RF')
# feat_importances = pd.Series(dict(zip(numerical_img_feats,rf.feature_importances_)))
# print("RF Feat Importances:\n",feat_importances.sort_values(ascending=False))
gbr = GradientBoostingRegressor(random_state=random_seed)
gbr_preds = _fit_score_plot(gbr, 'GBR')

In [None]:
# Every ordered 4-way product of the numerical image features, one column per
# combination, named "feat1/feat2/feat3/feat4".
# The columns are collected in a dict and the frame is built in one go, rather
# than inserting len(numerical_img_feats)**4 columns one at a time into an
# initially empty DataFrame.
# NOTE(review): a product does not depend on the order of its factors, so many
# of these columns are exact duplicates of each other - worth confirming this
# is intended before feeding them to a linear model.
high_lvl_cols = {
    num_feat + "/" + num_feat2 + "/" + num_feat3 + "/" + num_feat4:
        analysis_df[num_feat]*analysis_df[num_feat2]*analysis_df[num_feat3]*analysis_df[num_feat4]
    for num_feat in numerical_img_feats
    for num_feat2 in numerical_img_feats
    for num_feat3 in numerical_img_feats
    for num_feat4 in numerical_img_feats
}
high_lvl_df = pd.DataFrame(high_lvl_cols)
# show a preview only; the full frame is very wide
high_lvl_df.head()

In [None]:
# Linear regression of viewCount on the 4-way product features.
# The split is seeded so the score below is reproducible across runs.
high_train, high_test, targ_train, targ_test = train_test_split(
    high_lvl_df, analysis_df['viewCount'], random_state=42)
lr = LinearRegression()
lr.fit(high_train, targ_train)
lr.score(high_test, targ_test)

In [None]:
# predictions of the fitted linear model on the held-out product features
lr_preds = lr.predict(high_test)

In [None]:
# Predicted vs. actual viewCount for the linear model, labelled the same way
# as the RF / GBR plots so the figure can be read on its own.
plt.scatter(lr_preds, targ_test)
plt.title('LR viewCount Predictions vs Actual')
plt.xlabel('prediction')
plt.ylabel('actual')
plt.show()

In [None]:
# Flag videos whose numFaces is above zero, then split off the two tails:
# z_views below -1 (poor) and above 1 (good).
analysis_df['has_face'] = analysis_df['numFaces'].gt(0)
poor_perf = analysis_df.loc[analysis_df['z_views'].lt(-1)]
good_perf = analysis_df.loc[analysis_df['z_views'].gt(1)]

In [None]:
# describe() of every numerical image feature for the poor and the good group;
# after dropping "count" and transposing, each column is a feature and each
# row is a statistic.
poor_data = [poor_perf[num_feat].describe() for num_feat in numerical_img_feats]
good_data = [good_perf[num_feat].describe() for num_feat in numerical_img_feats]
poor_stats = pd.DataFrame(poor_data).drop(columns="count").T
good_stats = pd.DataFrame(good_data).drop(columns="count").T

In [None]:
# One grouped bar chart per image feature: good vs. poor performers, with one
# pair of bars for every statistic.
plt.rcParams['figure.figsize'] = [12, 5]
width = 0.35  # the width of the bars
x = np.arange(len(good_stats.index))  # the label locations
bar_groups = (
    (-width/2, good_stats, 'Good Performance'),
    (width/2, poor_stats, 'Poor Performance'),
)
for num_feat in numerical_img_feats:
    fig, ax = plt.subplots()
    for offset, group_stats, group_label in bar_groups:
        # the two groups sit half a bar-width either side of the tick
        ax.bar(x + offset, group_stats[num_feat], width, label=group_label)
    ax.set_xticks(x)
    ax.set_xticklabels(good_stats.index)
    ax.set_title(num_feat)
    ax.legend()
    plt.show()

# Old

In [None]:
# all_meta = pd.read_csv("../../data/local/fortnite/video_data/fortnite_master_metadata.csv")

In [None]:
# big_df = pd.read_csv('../../data/local/fortnite/video_data/merged_df.csv').drop("Unnamed: 0",axis=1)

In [None]:
# no_faces = final_df[final_df['numFaces'].isnull()]['videoId']
# faces = final_df[final_df['numFaces'].notnull()]['videoId']

In [None]:
# for i in np.random.choice(len(faces),size=20):
#     display(Image.open("../../data/local/fortnite/thumbnails/" + faces.iloc[i] + ".jpg"))
#     print(final_df.iloc[i]['numFaces'])

In [None]:
# for i in np.random.choice(len(no_faces),size=20):
#     display(Image.open("../../data/local/fortnite/thumbnails/" + no_faces.iloc[i] + ".jpg"))

In [None]:
# emotion = sorted_df['emotions'].apply(lambda x: len(ast.literal_eval(x)) if type(x) == str else 0)
# # print("Emotion:",targ)
# display(emotion.describe())
# plt.scatter(emotion,sorted_df['z_views'])
# # plt.show()

In [None]:
# numerical_img_feats = ['unique_rgb_ratio','num_rgb','mean_hue',
#                        'mean_saturation','mean_brightness', 'contrast',
#                        'edge_score','numFaces']
# meta_feats = ['z_views']

In [None]:
# get_corr(numerical_img_feats, meta_feats, face_data, '-')

In [None]:
# get_corr(numerical_img_feats, meta_feats, face_data, '+')

In [None]:
# c1 = 'num_rgb'
# c2 = 'edge_score'

In [None]:
# tuned_df = tune_params(face_data,c1,c2)
# tuned_df['mm_sum'] = tuned_df['mean'] + tuned_df['median']

In [None]:
# top_df = tuned_df.sort_values(by='mm_sum',ascending=False)
# rgb_cutoff = top_df['col1cutoff'].describe()['mean']
# sat_cutoff = top_df['col2cutoff'].describe()['mean']
# # face_data['z_views'].describe()
# top_cut = face_data[face_data.apply(
#     lambda x: x[c1] > rgb_cutoff and x[c2] > sat_cutoff,axis=1)]
# top_cut['z_views'].describe()

In [None]:
# top_200 = pd.read_csv('../../data/local/fortnite/video_data/merged_df.csv').drop("Unnamed: 0",axis=1)
# full_views = top_200[top_200['z_views'].notnull()]

In [None]:
# rf = RandomForestRegressor()
# rf.fit(full_views[basic_img_feats],full_views[['position']])
# rf.feature_importances_

In [None]:
# baseline_stats = sorted_df['z_views'].describe()
# print("baseline:     ",
#       "count:",baseline_stats['count'],
#       "mean:",baseline_stats['mean'].round(3),
#       "median:",baseline_stats['50%'].round(3))
# # print('---------------------------------------')
# #['baseline',0,baseline_stats['count'],baseline_stats['mean'],baseline_stats['50%']]
# rel_stats = []
# c1 = 'num_rgb'
# c1_range = get_range(c1,sorted_df)
# c2 = 'mean_saturation'
# c2_range = get_range(c2, sorted_df)
# for c1_cutoff in c1_range:
#     for c2_cutoff in c2_range:
#         filtered = sorted_df.apply(lambda x: x[c1] > c1_cutoff and x[c2] > c2_cutoff,axis=1)
#         stats = sorted_df[filtered]['z_views'].describe()
#         cur_count = stats['count']
#         cur_mean = stats['mean']
#         cur_median = stats['50%']
#         cur_stats_list = [c1,c1_cutoff,c2,c2_cutoff,cur_count, cur_mean, cur_median]
#         rel_stats.append(cur_stats_list)
# res_df = pd.DataFrame(rel_stats, columns=['col1','col1cutoff','col2','col2cutoff','count','mean','median'])

In [None]:
# not_null_df = res_df[res_df['mean'].notnull()]
# not_null_df[not_null_df.apply(lambda x: x['mean'] > .5 and x['median']> .2,axis=1)]

In [None]:
# c1 = 'mean_saturation'
# c1_stats = sorted_df[c1].describe()
# c1_range = np.arange(c1_stats['25%'],c1_stats['75%'],c1_stats['std']/3)
# c1_range

In [None]:
# col = 'contrast'
# print(sorted_df[col].describe())
# plt.scatter(sorted_df['contrast'],sorted_df['z_views'])

In [None]:
# for vid_id in basic_stats_df['videoId'].values:
#     if vid_id not in df['videoId'].values:
#         print(vid_id)

In [None]:
# merged_df = df.merge(basic_stats_df,how="left",on="videoId")

In [None]:
# datetime.now().strftime("_%m_%d_%y")