# Modeling "virality" of TED Talks

This file uses a sequence of ensemble models to systematically shrink the sample of 2,374 TED talks down to a smaller sample of talks that get the most views. Each individual layer, or "round", is designed to predict whether a talk is above or below the sample median. In each round, talks that are predicted to be above the median advance to the next round, others are "eliminated", a new median is calculated, and the process repeats itself as many times as necessary. Each round tests 6 different classification algorithms and chooses up to 3 that do not overfit. 

In [None]:
import pandas as pd
import sys
import pickle
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# LOAD
file = 'df_text'
# NOTE: unpickling can execute arbitrary code; only load a df_text file from a trusted source.
# The context manager closes the handle even if unpickling raises.
with open(file, 'rb') as f:
    df = pickle.load(f)

In [None]:
list(df.columns.values)

In [None]:
# Keep only relevant variables
df = df[['id', 'speaker', 'URL', 'time', 'technology', 'science', 'global', 'design', 'issues', 'culture', 'business', 
          'entertainment', 'change', 'art', 'biology', 'innovation', 'education', 'society', 'communication', 
          'politics', 'future', 'music', 'log_views', 'laughter_n', 'applause_n', 'questions_n', 'stories_n', 
          'talking_speed', 'laughter_speed', 'sentence_length', 'year', 'polarity', 
          'sentiment_range', 'sentiment_std', 'cluster_num0', 'cluster_num1', 'cluster_num2',  'cluster_num3', 
         'cluster_num4', 'cluster_num5', 'Spring', 'Summer', 'Winter', 'exclamation_n', 
         'he', 'she', 'he_she', 'self', 'we', 'you', 'filler', 
         'adjectives_percent', 'verbs_percent', 'nouns_percent', 'views']]

# Exploratory data analysis

In [None]:
df.views.mean()

In [None]:
plt.hist(df.views)

In [None]:
df.sort_values('views', ascending = False)[['speaker', 'views']].head(25)

Top 1% will be used as a definition of "viral," although the threshold doesn't affect the modeling process described above.

In [None]:
# Flag talks whose log views exceed the 16.36 cut-off (1 = viral, 0 = not).
# A vectorised comparison replaces the per-row lambda.
df['true_viral'] = (df.log_views > 16.36).astype(int)
print(df.true_viral.value_counts())

In [None]:
df[df.true_viral == 1]

Explore how the characteristics of the top 1% differ from those of the remaining 99%.

In [None]:
# Columns are selected with a list: tuple-style selection on a groupby is rejected by pandas 2.x.
df.groupby('true_viral')[['technology', 'science', 'global', 'design', 'issues', 'culture', 'business',
                          'entertainment', 'change', 'art', 'biology', 'innovation', 'education',
                          'society']].mean()

In [None]:
# Columns are selected with a list: tuple-style selection on a groupby is rejected by pandas 2.x.
df.groupby('true_viral')[['communication', 'politics', 'future', 'music', 'cluster_num0', 'cluster_num1',
                          'cluster_num2', 'cluster_num3', 'cluster_num4', 'cluster_num5']].mean()

In [None]:
# Columns are selected with a list: tuple-style selection on a groupby is rejected by pandas 2.x.
df.groupby('true_viral')[['laughter_n', 'applause_n', 'questions_n', 'stories_n', 'exclamation_n',
                          'talking_speed', 'laughter_speed', 'sentence_length', 'polarity',
                          'sentiment_range', 'sentiment_std']].mean()

In [None]:
# Columns are selected with a list: tuple-style selection on a groupby is rejected by pandas 2.x.
df.groupby('true_viral')[['he', 'she', 'he_she', 'self', 'you', 'we', 'filler', 'adjectives_percent',
                          'verbs_percent', 'nouns_percent', 'Spring', 'Summer', 'Winter', 'year']].mean()

Drop variables that are never present in viral talks: art, music, and future.

In [None]:
df.drop(['art', 'music', 'future'], axis=1, inplace=True)

In [None]:
features = ['technology', 'science', 'global', 'design', 'issues', 'culture', 'business', 'entertainment', 
            'change', 'biology', 'innovation', 'education', 'society', 'communication', 'politics', 
            'cluster_num1', 'cluster_num2', 'cluster_num3', 'cluster_num4', 
            'cluster_num5', 'laughter_n', 'applause_n', 'questions_n', 'stories_n', 'exclamation_n', 
            'sentence_length', 'polarity', 'sentiment_range', 'sentiment_std', 
            'he_she', 'self', 'you', 'we', 'filler', 'adjectives_percent', 'verbs_percent', 'nouns_percent' ,
            'Spring', 'Summer', 'Winter', 'year']  

# Functions

In [None]:
# Generate definition of viral
# NOTE: "Viral" is used loosely to represent above or below the median. 
def viral(data):
    """Label each row of ``data`` as above (1) or at/below (0) the median of ``log_views``.

    The median and the resulting class counts are printed. The label is
    written in place to the ``viral`` column of ``data``; nothing is returned.
    """
    views_median = data.log_views.median()
    print(views_median)
    # Vectorised comparison instead of a per-row lambda; ties with the median map to 0.
    data['viral'] = (data.log_views > views_median).astype(int)
    print(data.viral.value_counts())
    
def new_viral(data):
    """Keep the rows predicted viral (``y_pred == 1``) and relabel them against their own median.

    The surviving rows are stored in the module-level ``new_df``; ``viral`` is
    then called on it, which writes the 0/1 ``viral`` column for the new round.
    """
    global new_df
    # Copy the filtered rows so the columns written later land on an independent
    # frame rather than on a slice of ``data`` (avoids SettingWithCopyWarning).
    new_df = data[data.y_pred == 1].copy()
    viral(new_df)

In [None]:
# EDA done at each stage to examine whether features should be dropped
def viral_corr(var, data):
    """Print per-group feature means of ``data`` grouped by ``var``.

    The module-level ``features`` list is split in half and each half is
    printed as its own table to keep the output narrow.
    """
    half = len(features) // 2
    grouped = data.groupby(var)
    for columns in (features[:half], features[half:]):
        print(grouped[columns].mean())

In [None]:
# Helper function to use in general classifier
def plot_confusion_matrix(cm, title='Confusion Matrix', cmap=plt.cm.Blues):
    """Draw the matrix ``cm`` as a colour-mapped image on the current matplotlib figure.

    ``title`` is used as the plot title and ``cmap`` as the colour map. The
    y axis is labelled 'True label' and the x axis 'Predicted label'.
    Nothing is returned.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# General classifier function
def train_score(classifier, x, y, test_size):
    """Fit ``classifier`` on a scaled training split and report hold-out metrics.

    The data is split with a fixed seed (``random_state=1234``), the features
    are min-max scaled using the training split only, and ``classifier`` is
    fitted on the scaled training data. Train/test accuracy, the confusion
    matrix, positive-class precision and recall, and the ROC AUC are printed;
    the confusion matrix and the ROC curve are plotted. Nothing is returned.

    Parameters
    ----------
    classifier : estimator
        Must provide ``fit``, ``predict`` and ``predict_proba``.
    x, y : features and labels
        ``y`` is assumed to be binary 0/1 with 1 as the positive class; the
        2x2 confusion-matrix indexing below relies on both classes being
        present in the test split.
    test_size : float or int
        Forwarded to ``train_test_split``.
    """
    scaler = MinMaxScaler()
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=test_size, random_state=1234)
    # Fit the scaler on the training split only, then apply it to the test split.
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)
    ytrain = np.ravel(ytrain)
    clf = classifier.fit(xtrain, ytrain)

    # Use accuracy_score rather than clf.score: for a GridSearchCV built with
    # scoring='recall', clf.score would return recall under the "Accuracy" label.
    train_acc = accuracy_score(ytrain, clf.predict(xtrain))
    y_pred = clf.predict(xtest)
    test_acc = accuracy_score(ytest, y_pred)

    print("Training Data Accuracy: %0.2f" % (train_acc))
    print("Test Data Accuracy:     %0.2f" % (test_acc))

    # Confusion matrix: rows are true labels, columns are predicted labels.
    conf = confusion_matrix(ytest, y_pred)
    print('\n')
    print(conf)

    # Precision and recall of the positive class (label 1), matching ensemble().
    true_pos = conf[1, 1]
    false_pos = conf[0, 1]
    false_neg = conf[1, 0]
    print('\n')
    print("Precision:              %0.2f" % (true_pos / (true_pos + false_pos)))
    print("Recall:                 %0.2f" % (true_pos / (true_pos + false_neg)))

    # Plot the confusion matrix (reuse the matrix computed above).
    print('\n')
    plt.figure()
    plot_confusion_matrix(conf)

    # ROC curve from the predicted probability of the positive class.
    y_score = clf.predict_proba(xtest)[:, 1]
    fpr, tpr, _ = roc_curve(ytest, y_score)
    roc_auc = auc(fpr, tpr)

    print('AUC: ', roc_auc)

    plt.figure()
    plt.plot([0, 1], [0, 1])  # chance baseline
    plt.plot(fpr, tpr)        # ROC curve
    plt.xlabel('FPR')
    plt.ylabel('TPR')

In [None]:
def ensemble (data, classifier1, classifier2, classifier3, threshold):
    """Combine up to three fitted classifiers with an "any vote" rule and report metrics.

    Each named classifier is refitted on the module-level ``X`` and ``y``, and
    its positive-class probability for every row of ``X`` is stored in ``data``
    as ``y_score_<name>``. ``data['y_pred']`` is set to 1 when at least one of
    those scores reaches ``threshold`` and to 0 otherwise. The confusion matrix,
    precision, recall and accuracy are printed twice: against ``data['viral']``
    and against ``data['true_viral']``.

    Parameters
    ----------
    data : DataFrame
        Modified in place; must contain ``viral`` and ``true_viral`` and line
        up row-for-row with the global ``X``.
    classifier1, classifier2, classifier3 : str
        Keys into the model table below ('gbt', 'ada', 'rf', 'ext', 'lr',
        'bay'). The same key may be passed more than once; it is fitted once.
    threshold : float
        Minimum probability for a classifier to vote "viral".
    """
    models = {'gbt': model_gbt,
              'ada': model_ada,
              'rf': model_rf,
              'ext': model_ext,
              'lr': rfe_lr,
              'bay': rfe_Bayes}

    score_columns = []
    # dict.fromkeys de-duplicates while keeping order, so a repeated name is not refit.
    for name in dict.fromkeys((classifier1, classifier2, classifier3)):
        column = 'y_score_' + name
        model = models[name]
        model.fit(X, y)
        data[column] = model.predict_proba(X)[:, 1]
        score_columns.append(column)

    # A talk advances when any selected classifier scores it at or above the threshold.
    data['y_pred'] = (data[score_columns] >= threshold).any(axis=1).astype(int)

    conf_viral = confusion_matrix(data['viral'], data['y_pred'])
    conf_true = confusion_matrix(data['true_viral'], data['y_pred'])
    for conf in [conf_viral, conf_true]:
        true_neg, false_pos = conf[0, 0], conf[0, 1]
        false_neg, true_pos = conf[1, 0], conf[1, 1]
        print (conf)
        print ("Precision:              %0.2f" %(true_pos / (true_pos + false_pos)))
        print ("Recall:                 %0.2f"% (true_pos / (true_pos + false_neg)))
        print ("Accuracy:               %0.2f"% ((true_pos + true_neg) / (true_neg + true_pos + false_neg + false_pos)))

# Classification - Round 1

In [None]:
from itertools import product
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel

from sklearn.preprocessing import MinMaxScaler
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split

from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, auc

from sklearn.linear_model import LogisticRegression
from sklearn import naive_bayes
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.model_selection import cross_val_score, GridSearchCV, cross_val_predict
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import binarize

In [None]:
viral(df)

In [None]:
viral_corr('viral', df)

In [None]:
y = df['viral']
X = df[features]

In [None]:
len(features)

### Logistic regression

In [None]:
model_lr = LogisticRegression()

# Pass n_features_to_select by keyword; recent scikit-learn no longer accepts it positionally.
rfe_lr = RFE(model_lr, n_features_to_select=41)
train_score(rfe_lr, X, y, 0.2)

print ("Features sorted by their rank:")
print (sorted(zip(map(lambda x: round(x, 4), rfe_lr.ranking_), features)))

### Naive Bayes

In [None]:
model_Bayes = naive_bayes.BernoulliNB()
# Pass n_features_to_select by keyword; recent scikit-learn no longer accepts it positionally.
rfe_Bayes = RFE(model_Bayes, n_features_to_select=41)
train_score(rfe_Bayes, X, y, 0.2)

print ("Features sorted by their rank:")
print (sorted(zip(map(lambda x: round(x, 4), rfe_Bayes.ranking_), features)))

### Random forests

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400], 'n_jobs': [-1]}
model_rf = GridSearchCV(RandomForestClassifier(max_features = 30, max_depth = 3, random_state = 1234), 
                        param_grid=param_grid, 
                        cv=10, 
                        scoring='recall')
train_score(model_rf, X, y, 0.15)
print (model_rf.best_params_)

In [None]:
model_rf = RandomForestClassifier(max_features = 30, max_depth = 3, random_state = 1234, 
                        n_estimators = 100,
                        n_jobs = -1)

# Seed the split so the importance ranking is reproducible between runs.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.15, random_state = 1234)

model_rf.fit(xtrain, ytrain)

# Print the ten most important features, highest first.
importances = model_rf.feature_importances_
indices = np.argsort(importances)[::-1]

for i in indices[:10]:
    print (features[i], round(importances[i], 3))

### Extra Trees Classifier

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400]}
model_ext = GridSearchCV(ExtraTreesClassifier(max_features = 30, max_depth = 3, random_state = 1234), 
                         param_grid=param_grid, 
                         cv=10, 
                         scoring='recall')
train_score(model_ext, X, y, 0.2)
print (model_ext.best_params_)

In [None]:
model_ext = ExtraTreesClassifier(max_features = 30, max_depth = 3, random_state = 1234, 
                        n_estimators = 300,
                        n_jobs = -1)

# Seed the split so the importance ranking is reproducible between runs.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state = 1234)

model_ext.fit(xtrain, ytrain)

# Print the ten most important features, highest first.
importances = model_ext.feature_importances_
indices = np.argsort(importances)[::-1]

for i in indices[:10]:
    print (features[i], round(importances[i], 3))

### Gradient boosting

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400]}
model_gbt = GridSearchCV(GradientBoostingClassifier(max_features = 15, max_depth = 2, random_state = 1234), 
                         param_grid=param_grid, 
                         cv=10, 
                         scoring='recall')
train_score(model_gbt, X, y, 0.15)

### AdaBoost

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400]}
model_ada = GridSearchCV(AdaBoostClassifier(random_state = 1234), 
                         param_grid=param_grid, 
                         cv=10, 
                         scoring='recall')
train_score(model_ada, X, y, 0.2)

### Ensemble

In [None]:
ensemble(df, 'rf', 'bay', 'lr', 0.5)

In [None]:
# Print out those who were missed
df.loc[(df['true_viral'] == 1.0) & (df['y_pred'] == 0.0)]

In [None]:
# Average the scores of the classifiers used in this round's ensemble (rf, bay, lr).
# 'y_score_ext' does not exist at this point because 'ext' was not part of the round-1 ensemble.
df['y_score_mean'] = (df['y_score_rf'] + df['y_score_lr'] + df['y_score_bay']) / 3
top_percent = df[['speaker', 'y_score_mean', 'true_viral']].sort_values('y_score_mean', ascending = False).head(24)
top_percent.true_viral.value_counts()

In [None]:
# rename() returns a new frame, so nothing is modified in place on a slice of df.
round1_status = df[['id', 'speaker', 'true_viral', 'y_pred', 'views', 'log_views']].rename(
    columns={'y_pred': 'round1_pred'})

In [None]:
round1 = df

# Round 2: Repeat on those predicted to be "viral" in Round 1

Each round follows the same sequence of algorithms, using the sample of talks remaining from the previous round.

In [None]:
new_viral(round1)

In [None]:
y = new_df['viral']

In [None]:
y.value_counts()

In [None]:
viral_corr('viral', new_df)

In [None]:
for i in ['politics']:
    if i in features:
        features.remove(i)

In [None]:
X = new_df[features]
len(features)

In [None]:
model_lr = LogisticRegression()
# Pass n_features_to_select by keyword; recent scikit-learn no longer accepts it positionally.
rfe_lr = RFE(model_lr, n_features_to_select=28)
train_score(rfe_lr, X, y, 0.20)

print ("Features sorted by their rank:")
print (sorted(zip(map(lambda x: round(x, 4), rfe_lr.ranking_), features)))

In [None]:
model_Bayes = naive_bayes.BernoulliNB()
# Pass n_features_to_select by keyword; recent scikit-learn no longer accepts it positionally.
rfe_Bayes = RFE(model_Bayes, n_features_to_select=20)
train_score(rfe_Bayes, X, y, 0.2)

print ("Features sorted by their rank:")
print (sorted(zip(map(lambda x: round(x, 4), rfe_Bayes.ranking_), features)))

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400], 'n_jobs': [-1]}
model_rf = GridSearchCV(RandomForestClassifier(max_features = 25, max_depth = 2, random_state = 1234), 
                        param_grid=param_grid, 
                        cv=10, 
                        scoring='recall')
train_score(model_rf, X, y, 0.2)
print (model_rf.best_params_)

In [None]:
model_rf = RandomForestClassifier(max_features = 25, max_depth = 2, random_state = 1234, 
                        n_estimators = 200,
                        n_jobs = -1)

# Seed the split so the importance ranking is reproducible between runs.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.15, random_state = 1234)

model_rf.fit(xtrain, ytrain)

# Print the ten most important features, highest first.
importances = model_rf.feature_importances_
indices = np.argsort(importances)[::-1]

for i in indices[:10]:
    print (features[i], round(importances[i], 3))

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400]}
model_ext = GridSearchCV(ExtraTreesClassifier(max_features = 18, max_depth = 2, random_state = 1234), 
                         param_grid=param_grid, 
                         cv=10, 
                         scoring='recall')
train_score(model_ext, X, y, 0.15)
print (model_ext.best_params_)

In [None]:
model_ext = ExtraTreesClassifier(max_features = 18, max_depth = 2, random_state = 1234, 
                        n_estimators = 100,
                        n_jobs = -1)

# Seed the split so the importance ranking is reproducible between runs.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.15, random_state = 1234)

model_ext.fit(xtrain, ytrain)

# Print the ten most important features, highest first.
importances = model_ext.feature_importances_
indices = np.argsort(importances)[::-1]

for i in indices[:10]:
    print (features[i], round(importances[i], 3))

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400]}
model_gbt = GridSearchCV(GradientBoostingClassifier(max_features = 18, max_depth = 2, random_state = 1234), 
                         param_grid=param_grid, 
                         cv=10, 
                         scoring='recall')
train_score(model_gbt, X, y, 0.2)

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400]}
model_ada = GridSearchCV(AdaBoostClassifier(random_state = 1234), 
                         param_grid=param_grid, 
                         cv=10, 
                         scoring='recall')
train_score(model_ada, X, y, 0.2)

In [None]:
ensemble(new_df, 'ext', 'lr', 'rf', 0.5)

In [None]:
new_df.loc[(new_df['true_viral'] == 1.0) & (new_df['y_pred'] == 0.0)]

In [None]:
# Average the scores produced by this round's ensemble (ext, lr, rf).
# 'y_score_bay' is a column carried over from round 1, not a round-2 score.
new_df['y_score_mean'] = (new_df['y_score_ext'] + new_df['y_score_lr'] + new_df['y_score_rf']) / 3
top_percent = new_df[['speaker', 'y_score_mean', 'true_viral']].sort_values('y_score_mean', ascending = False).head(24)
top_percent.true_viral.value_counts()

In [None]:
# rename() returns a new frame, so nothing is modified in place on a slice of new_df.
round2_status = new_df[['id', 'y_pred']].rename(columns={'y_pred': 'round2_pred'})

# Talks eliminated before this round have no row here; the outer merge leaves NaN, filled with 0.
rounds1_2 = pd.merge(round1_status, round2_status, on='id', how='outer').fillna(value = 0)

In [None]:
round2 = new_df

# Round 3

In [None]:
new_viral(round2)

In [None]:
y = new_df['viral']

In [None]:
y.value_counts()

In [None]:
viral_corr('viral', new_df)

In [None]:
# Guard the removal, as the other rounds do, so re-running this cell does not raise ValueError.
for i in ['change']:
    if i in features:
        features.remove(i)

In [None]:
X = new_df[features]
len(features)

In [None]:
model_lr = LogisticRegression()
# Pass n_features_to_select by keyword; recent scikit-learn no longer accepts it positionally.
rfe_lr = RFE(model_lr, n_features_to_select=25)
train_score(rfe_lr, X, y, 0.2)

print ("Features sorted by their rank:")
print (sorted(zip(map(lambda x: round(x, 4), rfe_lr.ranking_), features)))

In [None]:
model_Bayes = naive_bayes.BernoulliNB()
# Pass n_features_to_select by keyword; recent scikit-learn no longer accepts it positionally.
rfe_Bayes = RFE(model_Bayes, n_features_to_select=30)
train_score(rfe_Bayes, X, y, 0.2)

print ("Features sorted by their rank:")
print (sorted(zip(map(lambda x: round(x, 4), rfe_Bayes.ranking_), features)))

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400], 'n_jobs': [-1]}
model_rf = GridSearchCV(RandomForestClassifier(max_features = 30, max_depth = 1, random_state = 1234), 
                        param_grid=param_grid, 
                        cv=10, 
                        scoring='recall')
train_score(model_rf, X, y, 0.3)
print(model_rf.best_params_)

In [None]:
model_rf = RandomForestClassifier(max_features = 30, max_depth = 1, random_state = 1234, 
                        n_estimators = 200,
                        n_jobs = -1)

xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state = 1234)

model_rf.fit(xtrain, ytrain)

importances = model_rf.feature_importances_
indices = np.argsort(importances)[::-1]

for i in indices[:10]:
    print (features[i], round(importances[i], 3))

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400]}
model_ext = GridSearchCV(ExtraTreesClassifier(max_features = 12, max_depth = 2, random_state = 1234), 
                         param_grid=param_grid, 
                         cv=10, 
                         scoring='recall')
train_score(model_ext, X, y, 0.3)
print(model_ext.best_params_)

In [None]:
model_ext = ExtraTreesClassifier(max_features = 12, max_depth = 2, random_state = 1234, 
                        n_estimators = 100,
                        n_jobs = -1)

# Seed the split, as the neighbouring round-3 cells do, so the ranking is reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state = 1234)

model_ext.fit(xtrain, ytrain)

# Print the ten most important features, highest first.
importances = model_ext.feature_importances_
indices = np.argsort(importances)[::-1]

for i in indices[:10]:
    print (features[i], round(importances[i], 3))

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400]}
model_gbt = GridSearchCV(GradientBoostingClassifier(max_features = 20, max_depth = 1, random_state = 1234), 
                         param_grid=param_grid, 
                         cv=10, 
                         scoring='recall')
train_score(model_gbt, X, y, 0.3)
print(model_gbt.best_params_)

In [None]:
model_gbt = GradientBoostingClassifier(max_features = 20, max_depth = 1, random_state = 1234, 
                        n_estimators = 200)

xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state = 1234)

model_gbt.fit(xtrain, ytrain)

importances = model_gbt.feature_importances_
indices = np.argsort(importances)[::-1]

for i in indices[:10]:
    print (features[i], round(importances[i], 3))

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400]}
model_ada = GridSearchCV(AdaBoostClassifier(random_state = 1234), 
                         param_grid=param_grid, 
                         cv=10, 
                         scoring='recall')
train_score(model_ada, X, y, 0.2)

In [None]:
ensemble(new_df, 'gbt', 'ext', 'lr', 0.5)

In [None]:
# Average the scores produced by this round's ensemble (gbt, ext, lr).
# 'y_score_rf' is a column carried over from round 2, not a round-3 score.
new_df['y_score_mean'] = (new_df['y_score_gbt'] + new_df['y_score_ext'] + new_df['y_score_lr']) / 3
top_percent = new_df[['speaker', 'y_score_mean', 'true_viral']].sort_values('y_score_mean', ascending = False).head(24)
top_percent.true_viral.value_counts()

In [None]:
# rename() returns a new frame, so nothing is modified in place on a slice of new_df.
round3_status = new_df[['id', 'y_pred']].rename(columns={'y_pred': 'round3_pred'})

# Talks eliminated before this round have no row here; the outer merge leaves NaN, filled with 0.
rounds123 = pd.merge(rounds1_2, round3_status, on='id', how='outer').fillna(value = 0)

In [None]:
round3 = new_df

# Round 4

In [None]:
new_viral(round3)

In [None]:
y = new_df['viral']
y.value_counts()

In [None]:
viral_corr('viral', new_df)

In [None]:
for i in ['design']:
    if i in features:
        features.remove(i)

In [None]:
len(features)

In [None]:
X = new_df[features]

In [None]:
model_lr = LogisticRegression()
# Bind the selector to rfe_lr: ensemble() looks up 'lr' under that name, so a
# throwaway name would leave the previous round's selector in use.
rfe_lr = RFE(model_lr, n_features_to_select=25)
train_score(rfe_lr, X, y, 0.2)

print ("Features sorted by their rank:")
print (sorted(zip(map(lambda x: round(x, 4), rfe_lr.ranking_), features)))

In [None]:
model_Bayes = naive_bayes.BernoulliNB()
# Bind the selector to rfe_Bayes: ensemble() looks up 'bay' under that name, so a
# throwaway name would leave the previous round's selector in use.
rfe_Bayes = RFE(model_Bayes, n_features_to_select=15)
train_score(rfe_Bayes, X, y, 0.2)

print ("Features sorted by their rank:")
print (sorted(zip(map(lambda x: round(x, 4), rfe_Bayes.ranking_), features)))

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400], 'n_jobs': [-1]}
model_rf = GridSearchCV(RandomForestClassifier(max_features = 30, max_depth = 1, random_state = 1234), 
                        param_grid=param_grid, 
                        cv=10, 
                        scoring='recall')
train_score(model_rf, X, y, 0.30)
print(model_rf.best_params_)

In [None]:
model_rf = RandomForestClassifier(max_features = 30, max_depth = 1, random_state = 1234, 
                        n_estimators = 300,
                        n_jobs = -1)

xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state = 1234)

model_rf.fit(xtrain, ytrain)

importances = model_rf.feature_importances_
indices = np.argsort(importances)[::-1]

for i in indices[:10]:
    print (features[i], round(importances[i], 3))

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400]}
model_ext = GridSearchCV(ExtraTreesClassifier(max_features = 20, max_depth = 3, random_state = 1234), 
                         param_grid=param_grid, 
                         cv=10, 
                         scoring='recall')
train_score(model_ext, X, y, 0.3)
print(model_ext.best_params_)

In [None]:
model_ext = ExtraTreesClassifier(max_features = 20, max_depth = 3, random_state = 1234, 
                        n_estimators = 100,
                        n_jobs = -1)

xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state = 1234)

model_ext.fit(xtrain, ytrain)

importances = model_ext.feature_importances_
indices = np.argsort(importances)[::-1]

for i in indices[:10]:
    print (features[i], round(importances[i], 3))

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400]}
model_gbt = GridSearchCV(GradientBoostingClassifier(max_features = 25, max_depth = 1, random_state = 1234), 
                         param_grid=param_grid, 
                         cv=10, 
                         scoring='recall')
train_score(model_gbt, X, y, 0.3)
print(model_gbt.best_params_)

In [None]:
model_gbt = GradientBoostingClassifier(max_features = 25, max_depth = 1, random_state = 1234, 
                        n_estimators = 100)

xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state = 1234)

model_gbt.fit(xtrain, ytrain)

importances = model_gbt.feature_importances_
indices = np.argsort(importances)[::-1]

for i in indices[:10]:
    print (features[i], round(importances[i], 3))

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400]}
model_ada = GridSearchCV(AdaBoostClassifier(random_state = 1234), 
                         param_grid=param_grid, 
                         cv=10, 
                         scoring='recall')
train_score(model_ada, X, y, 0.2)

In [None]:
ensemble(new_df, 'lr', 'gbt', 'gbt', 0.5)

In [None]:
new_df.loc[(new_df['true_viral'] == 1.0) & (new_df['y_pred'] == 0.0)]

In [None]:
# Average the scores produced by this round's ensemble (lr, gbt).
# 'y_score_ext' is a column carried over from round 3, not a round-4 score.
new_df['y_score_mean'] = (new_df['y_score_lr'] + new_df['y_score_gbt']) / 2
top_percent = new_df[['speaker', 'y_score_mean', 'true_viral']].sort_values('y_score_mean', ascending = False).head(24)
top_percent.true_viral.value_counts()

In [None]:
# rename() returns a new frame, so nothing is modified in place on a slice of new_df.
round4_status = new_df[['id', 'y_pred']].rename(columns={'y_pred': 'round4_pred'})

# Talks eliminated before this round have no row here; the outer merge leaves NaN, filled with 0.
rounds1234 = pd.merge(rounds123, round4_status, on='id', how='outer').fillna(value = 0)

In [None]:
round4 = new_df

# Round 5

In [None]:
new_viral(round4)

In [None]:
y = new_df['viral']
y.value_counts()

In [None]:
viral_corr('viral', new_df)

In [None]:
for i in ['innovation', 'society']:
    if i in features:
        features.remove(i)

In [None]:
X = new_df[features]
len(features)

In [None]:
model_lr = LogisticRegression()
# Bind the selector to rfe_lr: ensemble() looks up 'lr' under that name, so a
# throwaway name would leave an earlier round's selector in use.
rfe_lr = RFE(model_lr, n_features_to_select=10)
train_score(rfe_lr, X, y, 0.2)

print ("Features sorted by their rank:")
print (sorted(zip(map(lambda x: round(x, 4), rfe_lr.ranking_), features)))

In [None]:
model_Bayes = naive_bayes.BernoulliNB()
# Bind the selector to rfe_Bayes: this round's ensemble() call uses 'bay', which is
# looked up under that name; a throwaway name would reuse an earlier round's selector.
rfe_Bayes = RFE(model_Bayes, n_features_to_select=20)
train_score(rfe_Bayes, X, y, 0.2)

print ("Features sorted by their rank:")
print (sorted(zip(map(lambda x: round(x, 4), rfe_Bayes.ranking_), features)))

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400], 'n_jobs': [-1]}
model_rf = GridSearchCV(RandomForestClassifier(max_features = 25, max_depth = 1, random_state = 1234), 
                        param_grid=param_grid, 
                        cv=10, 
                        scoring='recall')
train_score(model_rf, X, y, 0.3)
print(model_rf.best_params_)

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400]}
model_ext = GridSearchCV(ExtraTreesClassifier(max_features = 30, max_depth = 1, random_state = 1234), 
                         param_grid=param_grid, 
                         cv=10, 
                         scoring='recall')
train_score(model_ext, X, y, 0.2)
print(model_ext.best_params_)

In [None]:
model_ext = ExtraTreesClassifier(max_features = 30, max_depth = 1, random_state = 1234, 
                        n_estimators = 400,
                        n_jobs = -1)

xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state = 1234)

model_ext.fit(xtrain, ytrain)

importances = model_ext.feature_importances_
indices = np.argsort(importances)[::-1]

for i in indices[:10]:
    print (features[i], round(importances[i], 3))

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400]}
model_gbt = GridSearchCV(GradientBoostingClassifier(max_features = 10, max_depth = 1, random_state = 1234), 
                         param_grid=param_grid, 
                         cv=10, 
                         scoring='recall')
train_score(model_gbt, X, y, 0.3)

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400]}
model_ada = GridSearchCV(AdaBoostClassifier(random_state = 1234), 
                         param_grid=param_grid, 
                         cv=10, 
                         scoring='recall')
train_score(model_ada, X, y, 0.2)

In [None]:
ensemble(new_df, 'bay', 'ext', 'bay', 0.5)

In [None]:
new_df.loc[(new_df['true_viral'] == 1.0) & (new_df['y_pred'] == 0.0)]

In [None]:
# Average the scores produced by this round's ensemble (bay, ext).
# 'y_score_lr' and 'y_score_rf' are columns carried over from earlier rounds.
new_df['y_score_mean'] = (new_df['y_score_bay'] + new_df['y_score_ext']) / 2
top_percent = new_df[['speaker', 'y_score_mean', 'true_viral']].sort_values('y_score_mean', ascending = False).head(24)
top_percent.true_viral.value_counts()

In [None]:
# rename() returns a new frame, so nothing is modified in place on a slice of new_df.
round5_status = new_df[['id', 'y_pred']].rename(columns={'y_pred': 'round5_pred'})

# Talks eliminated before this round have no row here; the outer merge leaves NaN, filled with 0.
rounds12345 = pd.merge(rounds1234, round5_status, on='id', how='outer').fillna(value = 0)

In [None]:
round5 = new_df

# Round 6

In [None]:
new_viral(round5)

In [None]:
y = new_df['viral']
y.value_counts()

In [None]:
viral_corr('viral', new_df)

In [None]:
for i in ['society', 'communication']:
    if i in features:
        features.remove(i)

In [None]:
X = new_df[features]
len(features)

In [None]:
model_lr = LogisticRegression()
# Bind the selector to rfe_lr: ensemble() looks up 'lr' under that name, so a
# throwaway name would leave an earlier round's selector in use.
rfe_lr = RFE(model_lr, n_features_to_select=12)
train_score(rfe_lr, X, y, 0.2)

print ("Features sorted by their rank:")
print (sorted(zip(map(lambda x: round(x, 4), rfe_lr.ranking_), features)))

In [None]:
model_Bayes = naive_bayes.BernoulliNB()
# Bind the selector to rfe_Bayes: ensemble() looks up 'bay' under that name, so a
# throwaway name would leave an earlier round's selector in use.
rfe_Bayes = RFE(model_Bayes, n_features_to_select=12)
train_score(rfe_Bayes, X, y, 0.2)

print ("Features sorted by their rank:")
print (sorted(zip(map(lambda x: round(x, 4), rfe_Bayes.ranking_), features)))

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400], 'n_jobs': [-1]}
model_rf = GridSearchCV(RandomForestClassifier(max_features = 30, max_depth = 1, random_state = 1234), 
                        param_grid=param_grid, 
                        cv=10, 
                        scoring='recall')
train_score(model_rf, X, y, 0.2)
print(model_rf.best_params_)

In [None]:
model_rf = RandomForestClassifier(max_features = 30, max_depth = 1, random_state = 1234, 
                        n_estimators = 100,
                        n_jobs = -1)

xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state = 1234)

model_rf.fit(xtrain, ytrain)

importances = model_rf.feature_importances_
indices = np.argsort(importances)[::-1]

for i in indices[:20]:
    print (features[i], round(importances[i], 3))

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400]}
model_gbt = GridSearchCV(GradientBoostingClassifier(max_features = 10, max_depth = 1, random_state = 1234 ), 
                         param_grid=param_grid, 
                         cv=10, 
                         scoring='recall')
train_score(model_gbt, X, y, 0.2)

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400]}
model_ada = GridSearchCV(AdaBoostClassifier(random_state = 1234), 
                         param_grid=param_grid, 
                         cv=10, 
                         scoring='recall')
train_score(model_ada, X, y, 0.2)

In [None]:
param_grid = {'n_estimators': [100, 200, 300, 400]}
model_ext = GridSearchCV(ExtraTreesClassifier(max_features = 25, max_depth = 1, random_state = 1234), 
                         param_grid=param_grid, 
                         cv=10, 
                         scoring='recall')
train_score(model_ext, X, y, 0.2)

In [None]:
ensemble(new_df, 'rf', 'rf', 'rf', 0.5)

In [None]:
new_df.loc[(new_df['true_viral'] == 1.0) & (new_df['y_pred'] == 0.0)]

In [None]:
# This round's ensemble uses only the random forest, so the mean score is its score.
new_df['y_score_mean'] = new_df['y_score_rf']
top_percent = new_df[['speaker', 'y_score_mean', 'true_viral']].sort_values('y_score_mean', ascending = False).head(24)
top_percent.true_viral.value_counts()

In [None]:
# rename() returns a new frame, so nothing is modified in place on a slice of new_df.
round6_status = new_df[['id', 'y_pred']].rename(columns={'y_pred': 'round6_pred'})

# Talks eliminated before this round have no row here; the outer merge leaves NaN, filled with 0.
rounds123456 = pd.merge(rounds12345, round6_status, on='id', how='outer').fillna(value = 0)

In [None]:
# The outer merges plus fillna leave the round 2-6 prediction columns as floats; cast them back to int.
for i in range(2, 7):
    name = 'round' + str(i) + '_pred'
    rounds123456[name] = rounds123456[name].astype(int)

In [None]:
rounds123456.to_csv('round-by-round status.csv')