In [1]:
#do this later from admin: !python -m pip install --upgrade pip


# TOC

0. [Imports](#first-bullet)
1. [EDA](#1.0-bullet)
2. [Feature Engineering](#2.0-bullet)  
 2.1 [Pt 1 - Regression](#2.1-bullet)  
 2.2 [Pt 2 - Classification](#2.2-bullet)
3. [Basic Model](#3.0-bullet)  
 3.1 [Pt 1 - Regression](#3.1-bullet)  
 3.2 [Pt 2 - Classification](#3.2-bullet)
4. [Reference and Planning](#4.0-bullet)

# Do not Run All cells - lots of charts

In [1]:
#Basic
import pandas as pd
import numpy as np
#Math
import scipy as sp
from random import randint
from math import exp
import operator

#Scraping
#import requests
#import json

#Youtube Data API Packages: 
"""from apiclient.discovery import build
from apiclient.errors import HttpError
from oauth2client.tools import argparser
from googleapiclient.discovery import build
import argparse
from googleapiclient.errors import HttpError"""

#Modeling
"""from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
#from sklearn.linear_model import LogisticRegression
#from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.cluster import KMeans
#from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve"""

#NLP & Text Management
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from textblob import TextBlob
from wordcloud import WordCloud
from nltk.stem import PorterStemmer

#Time analysis
import time
import datetime
"""from statsmodels.tsa.stattools import pacf
from statsmodels.graphics.tsaplots import plot_pacf"""

#Plotting
import matplotlib.pyplot as plt
import seaborn as sns
#from wordcloud import WordCloud

%matplotlib inline
plt.style.use('seaborn-whitegrid')
sns.set_style('whitegrid')

In [2]:
df_main = pd.read_csv('./data/clean_data_nocomments_noviews_10.10.18.csv')

#create log features
df_main['view_log'] = df_main['viewCount'].map(lambda x: 0 if x==0 else np.log10(x)) #Which log to use???
df_main['comment_log'] = df_main['commentCount'].map(lambda x: 0 if x==0 else np.log10(x))
df_main['like_log'] = df_main['likeCount'].map(lambda x: 0 if x==0 else np.log10(x))
df_main['dislike_log'] = df_main['dislikeCount'].map(lambda x: 0 if x==0 else np.log10(x))
df_main['view_class'] = df_main.view_log.astype(int)

df_main.publishedAt = pd.to_datetime(df_main.publishedAt)

df_main.description.fillna('', inplace=True)
df_main.tags.fillna('', inplace=True)

Run_Text_Analysis = False

# <a class="anchor" id="2.0-bullet"> 2. Feature Engineering</a>

## <a class="anchor" id="2.1-bullet">2.1 Adding Features</a>



### Date Time
 * Week Number
 * is Weekend
 * Year

In [3]:
df_main['week'] = df_main.publishedAt.map(lambda x: x.week)

In [4]:
df_main['is_weekend'] = df_main.publishedAt.dt.weekday.map(lambda x: 1 if x == 5 or x==6 else 0)

In [5]:
df_main['is_friday'] = df_main.publishedAt.dt.weekday.map(lambda x: 1 if x == 4 else 0)

In [6]:
# df_main['year'] exists

### Description
 * Sentiment description_sentiment
 * Overall word count description_wordcount
 * CVEC matrix

In [7]:
df_main['description_sentiment'] = df_main['description'].map(lambda x: TextBlob(str(x)).sentiment.polarity) #Title)

In [8]:
df_main['description_wordcount'] = df_main['description'].map(lambda x: str(x).count(' '))

In [9]:
if Run_Text_Analysis: 
    corp = ''
    for i in range(df_main.shape[0]):
        corp += re.sub(r'http[^ ]+', '',df_main.iloc[i]['description']).replace("\r", '').replace("\n", '')

    df_corp= pd.DataFrame([corp], index=[1])
    cvec = CountVectorizer()
    df_cvec_description = cvec.fit_transform(df_corp[0]).todense()
    df_cvec_description = pd.DataFrame(df_cvec_description, columns = cvec.get_feature_names())

    descr_words = [col for col in df_cvec_description.columns if df_cvec_description[col].sum()>200]

    lmtzr = PorterStemmer()
    lem_list = []
    for word in descr_words:
        lem_list.append(lmtzr.stem(word))
    lem_list = list(set(lem_list))
    df_descr_word_vec = df_main.loc[:, ['description', 'vidId']]

    for i in range(df_descr_word_vec.shape[0]):
        df_descr_word_vec.at[i,'description'] = ' '.join([lmtzr.stem(word) for word in (re.sub(r'http[^ ]+', '', \
                                                                                               df_descr_word_vec.description[i]).replace("\r", '').replace("\n", '')).split(' ')])

    for word in lem_list:
        df_descr_word_vec[str('descr_word_' + word)] = df_descr_word_vec.description.str.count(word)

    df_descr_word_vec.to_csv('./data/description_wordvec2.csv')

### Tags
 * Sentiment
 * CVEC
 * Num tags

In [10]:
df_main['tags_sentiment'] = df_main['tags'].map(lambda x: TextBlob(str(x)).sentiment.polarity)

df_main['tags_wordcount'] = df_main['tags'].map(lambda x: str(x).count(','))

In [11]:
if Run_Text_Analysis: 
    corp = ''
    for i in range(df_main.shape[0]):
        try:
            corp += re.sub(r'http[^ ]+', '',df_main.iloc[i]['tags']).replace("\r", '').replace("\n", '')
        except:
            next

    df_corp= pd.DataFrame([corp], index=[1])
    cvec = CountVectorizer()
    df_cvec_tags = cvec.fit_transform(df_corp[0]).todense()
    df_cvec_tags = pd.DataFrame(df_cvec_tags, columns = cvec.get_feature_names())

    tags_words = [col for col in df_cvec_tags.columns if df_cvec_tags[col].sum()>200]

    lmtzr = PorterStemmer()
    lem_list = []
    for word in tags_words:
        lem_list.append(lmtzr.stem(word))
    lem_list = list(set(lem_list))
    df_tags_word_vec = df_main.loc[:, ['tags', 'vidId']]

    for i in range(df_tags_word_vec.shape[0]):
        try:
            df_tags_word_vec.at[i,'tags'] = ' '.join([lmtzr.stem(word) for word in (re.sub(r'http[^ ]+', '', \
                                                                                           df_tags_word_vec.tags[i]).replace("\r", '').replace("\n", '')).split(' ')])
        except:
            df_tags_word_vec.at[i,'tags'] = ''

    for word in lem_list:
        df_tags_word_vec[str('tags_word_' + word)] = df_tags_word_vec.tags.str.count(word)

    df_tags_word_vec.fillna(0, inplace=True)

    #df_tags_word_vec.to_csv('./data/tag_wordvec2.csv')

### Title
 * Sentiment
 * Wordcount
 * CVEC predictions
 * feat / ft. 
 * Which letters
 * Length

In [12]:
df_main['title_sentiment'] = df_main['title'].map(lambda x: TextBlob(str(x)).sentiment.polarity)
df_main['title_wordcount'] = df_main['title'].map(lambda x: str(x).count(' '))

In [13]:
feat_words = ['feat', 'ft', 'featur']
def find_feats(text_string):
    text_string = text_string.lower()
    for word in feat_words:
        if word in text_string:
            return 1
    return 0
df_main['title_featuring'] = df_main['title'].map(lambda x: find_feats(x))

In [14]:
df_main[df_main['title_featuring']==1]['viewCount'].mean() / df_main[df_main['title_featuring']==0]['viewCount'].mean()

2.8603427786398767

In [15]:
letters = 'abcdefghijklmnopqrstuvwxyz'
for letter in letters:
    df_main[str('intitle_' + letter)] = df_main.title.str.count(letter)

In [16]:
df_main.intitle_e.corr(df_main.viewCount)

-4.909150499504149e-05

In [17]:
dict_letters = {}
for letter in letters:
     dict_letters[letter] = df_main['intitle_'+letter].corr(df_main.viewCount)

sorted_by_value = sorted(dict_letters.items(), key=lambda kv: kv[1])

sorted_by_value

[('p', -0.04000300086280226),
 ('s', -0.03754157090929833),
 ('b', -0.03595935668289338),
 ('m', -0.033256677040368686),
 ('r', -0.032486695162294),
 ('v', -0.031097787374934398),
 ('z', -0.023971870954070173),
 ('g', -0.02137616539138603),
 ('o', -0.020916860900920094),
 ('x', -0.018601016174218863),
 ('j', -0.01761572654265915),
 ('w', -0.015499996520572504),
 ('q', -0.009614216359448539),
 ('n', -0.005906172544945663),
 ('t', -0.0017217805286332197),
 ('e', -4.909150499504149e-05),
 ('y', 0.0007098444506370824),
 ('c', 0.000986900500183664),
 ('u', 0.001448120383550104),
 ('h', 0.0051633083765732415),
 ('l', 0.007845902261759201),
 ('d', 0.02424417914699086),
 ('a', 0.028695981590447797),
 ('k', 0.03151026259519688),
 ('i', 0.0460620475440144),
 ('f', 0.13754559376827483)]

Look at f go!

In [18]:
df_main['title_length'] = df_main['title'].map(lambda x: len(str(x)))

In [19]:
df_main.title_length.corr(df_main.view_log)

0.04400583186930594

In [20]:
if Run_Text_Analysis: 
    corp = ''
    for i in range(df_main.shape[0]):
        try:
            corp += re.sub(r'http[^ ]+', '',df_main.iloc[i]['title']).replace("\r", '').replace("\n", '')
        except:
            next

    df_corp= pd.DataFrame([corp], index=[1])
    cvec = CountVectorizer()
    df_cvec_title = cvec.fit_transform(df_corp[0]).todense()
    df_cvec_title = pd.DataFrame(df_cvec_title, columns = cvec.get_feature_names())

    title_words = [col for col in df_cvec_title.columns if df_cvec_title[col].sum()>200]

    lmtzr = PorterStemmer()
    lem_list = []
    for word in title_words:
        lem_list.append(lmtzr.stem(word))
    lem_list = list(set(lem_list))
    df_title_word_vec = df_main.loc[:, ['title', 'vidId']]

    for i in range(df_title_word_vec.shape[0]):
        try:
            df_title_word_vec.at[i,'title'] = ' '.join([lmtzr.stem(word) for word in (re.sub(r'http[^ ]+', '', \
                                                                                           df_title_word_vec.title[i]).replace("\r", '').replace("\n", '')).split(' ')])
        except:
            df_title_word_vec.at[i,'title'] = ''

    for word in lem_list:
        df_title_word_vec[str('title_word_' + word)] = df_title_word_vec.title.str.count(word)

    df_title_word_vec.fillna(0, inplace=True)

    df_title_word_vec.to_csv('./data/title_wordvec2.csv')

### Ratios

#### Likes / Views

In [21]:
df_main[(df_main['likeCount']>0)&(df_main['viewCount']<=5)]['likeCount']

6263     1
8495    82
8532    10
Name: likeCount, dtype: int64

In [22]:
#Messing up the ratios with likecounts on videos with no views
df_main.at[6263, 'likeCount'] = 0
df_main.at[8495, 'likeCount'] = 0
df_main.at[8532, 'likeCount'] = 0


In [23]:
df_main['lv_ratio'] = (df_main['like_log'] / df_main['view_log'])

df_main['lv_ratio'].fillna(1, inplace=True)

df_main.loc[: , ['lv_ratio', 'viewCount', 'view_log']].corr()

Unnamed: 0,lv_ratio,viewCount,view_log
lv_ratio,1.0,0.174481,0.664305
viewCount,0.174481,1.0,0.372559
view_log,0.664305,0.372559,1.0


#### comments / views

In [24]:
df_main[(df_main['commentCount']>0)&(df_main['viewCount']<=0)]['commentCount']

8495    3
Name: commentCount, dtype: int64

In [25]:
df_main.at[8495, 'commentCount'] = 0

In [26]:
df_main['comment_view_ratio'] = (df_main['comment_log'] / df_main['view_log'])

df_main['comment_view_ratio'].fillna(1, inplace=True)

df_main.loc[: , ['comment_view_ratio', 'viewCount', 'view_log']].corr()

Unnamed: 0,comment_view_ratio,viewCount,view_log
comment_view_ratio,1.0,0.231102,0.7213
viewCount,0.231102,1.0,0.372559
view_log,0.7213,0.372559,1.0


In [27]:
#Skip dislike / view: same as comment

#### Dislikes / Likes

In [28]:
df_main[(df_main['dislikeCount']>0)&(df_main['viewCount']<=0)]['dislikeCount']

8495    10
Name: dislikeCount, dtype: int64

In [29]:
df_main.at[8495, 'dislikeCount'] = 0

In [30]:
df_main['like_dislike_ratio'] = (df_main['like_log'] / df_main['dislike_log'])

df_main['like_dislike_ratio'].fillna(1, inplace=True)

df_main.loc[: , ['like_dislike_ratio', 'viewCount', 'view_log']].corr()

Unnamed: 0,like_dislike_ratio,viewCount,view_log
like_dislike_ratio,1.0,-0.137531,-0.517068
viewCount,-0.137531,1.0,0.372559
view_log,-0.517068,0.372559,1.0


#### Deal with infinities

In [31]:
df_main['lv_ratio'].value_counts().sort_index(ascending=False).iloc[:2]

inf          2
1.000000    20
Name: lv_ratio, dtype: int64

In [32]:
df_main['comment_view_ratio'].value_counts().sort_index(ascending=False).iloc[:2]

inf          1
1.000000    22
Name: comment_view_ratio, dtype: int64

In [33]:
df_main['like_dislike_ratio'].value_counts().sort_index(ascending=False).iloc[:2]

inf          787
10.022368      1
Name: like_dislike_ratio, dtype: int64

In [34]:
df_main.lv_ratio = df_main.lv_ratio.map(lambda x: 7 if x == np.inf else x)

df_main.comment_view_ratio = df_main.comment_view_ratio.map(lambda x: 7 if x == np.inf else x)
df_main.like_dislike_ratio = df_main.like_dislike_ratio.map(lambda x: 10 if x == np.inf else x)

# <a class="anchor" id="3.0-bullet"> 3. Output</a>

In [35]:
now_month = datetime.datetime.now().month
now_day = datetime.datetime.now().day
now_hour = datetime.datetime.now().hour
now_minute = datetime.datetime.now().minute
output_title = './data/engineered_data/data_engineered_round2_{}.{}_{}{}.csv'.format(now_month, now_day, now_hour, now_minute)

df_main.to_csv(output_title, index=False)

## Features Description

Date Time
* General: publishedAt
* Friday (is_friday)
* Is weekend (is_weekend)
* week number (week)
* Month day (month_day)
* Month (month)
* 2 digit year number (year)

* Ignoring overall day count - possible overfit to search algorithm

Description
* Sentiment (description_sentiment)
* Overall word count (description_wordcount)
* CVEC (./data/engineered_data/description_wordvec.csv)


Tags
* Sentiment (tags_sentiment)
* Num tags (tags_wordcount)
* CVEC (./data/engineered_data/tag_wordvec.csv)

Title
* Sentiment (title_sentiment)
* Word count (title_wordcount)
* feat / ft. (title_featuring)
* Letter Count (intitle_ + a,b,c,...)
* String length (title_length)
* Which letters
* Length  
* CVEC (./data/engineered_data/title_wordvec.csv)

Ratios
* likes / views (lv_ratio)
* comments / view (comment_view_ratio)
* likes / dislikes (like_dislike_ratio)

Measures (includes log)
* commentCount
* dislikeCount 
* favoriteCount 
* likeCount: 
* viewCount: view_log

Other
* Has content rating restriction (contentRating)
* Has any region Restriction (regionRestriction)
* Has captioned text (caption)
* Content has approved license (licensedContent)
* Duration of video (duration)
* High definition 1 or std (0) (definition)


In [36]:
model_time = ['is_weekend', 'is_friday', 'year_day', 'week', 'week_day', 'month', 'month_day', 'year',  ]
model_text = ['description_sentiment', 'description_wordcount',
  'tags_sentiment', 'tags_wordcount', 'title_sentiment',
 'title_wordcount', 'title_featuring', 'title_length', ]
model_title_letter = ['intitle_a',  'intitle_b', 'intitle_c', 'intitle_d', 'intitle_e', 'intitle_f', 'intitle_g',
 'intitle_h', 'intitle_i', 'intitle_j', 'intitle_k', 'intitle_l', 'intitle_m', 'intitle_n', 'intitle_o',
 'intitle_p', 'intitle_q', 'intitle_r', 'intitle_s', 'intitle_t', 'intitle_u', 'intitle_v', 'intitle_w',
 'intitle_x', 'intitle_y', 'intitle_z',]              
model_ratios = ['lv_ratio', 'comment_view_ratio', 'like_dislike_ratio',]
model_other = ['duration', 'contentRating', 'regionRestriction',  'licensedContent', 'caption', 'definition']
measures = ['likeCount', 'dislikeCount', 'commentCount', 'viewCount',
            'view_log',  'comment_log',  'like_log', 'dislike_log', 
            'view_class' ]
non_model_cols = ['request_token', 'letter_search','channelTitle', 'tags', 'title', 'vidId','publishedAt','favoriteCount',
              'description','defaultLanguage', ]


[col for col in df_main.columns if \
 col not in model_time and \
 col not in model_text and \
col not in model_title_letter and \
col not in model_ratios and \
col not in model_other and 
col not in measures and \
col not in non_model_cols]

[]