## Using Logistic Regression in TED talk data

In [23]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## Using "Male" as binary classification

In [27]:
df1 = pd.read_csv("ted_tube.csv")
df_gen = pd.read_csv("gender.csv")
df = df1.drop(['Unnamed: 0', 'description', 'main_speaker',
              'name', 'title', 'url', 'ratings_list', 
              'tags', 'related_talks', 'ratings', 'speaker_occupation',
               'film_date_time', 'event'], axis=1)
df = pd.concat([df, df_gen], axis=1)
df = df.drop(['Unnamed: 0'], axis=1)
df.head(2)

Unnamed: 0,comments,duration,film_date,languages,num_speaker,published_date,views,comments_per_view,views_per_comment,num_ratings,funny_ratings,funny_rate,speaker_gender
0,4553,1164,1140825600,60,1,1151367060,47227110,9.6e-05,10372.745443,93850,19645,0.209323,male
1,265,977,1140825600,43,1,1151367060,3200520,8.3e-05,12077.433962,2936,544,0.185286,male


In [29]:
# converts categorical data
sex = pd.get_dummies(df['speaker_gender'], drop_first=True)
sex.head()

Unnamed: 0,female,male,mostly_female,mostly_male,unknown
0,0,1,0,0,0
1,0,1,0,0,0
2,0,1,0,0,0
3,0,0,0,0,1
4,0,1,0,0,0


In [30]:
sex.drop(['female', 'unknown', 'mostly_male', 'mostly_female'], axis=1, inplace=True)
sex.head()


Unnamed: 0,male
0,1
1,1
2,1
3,0
4,1


In [31]:
df.drop(['speaker_gender'], axis=1, inplace=True)
df.head()

Unnamed: 0,comments,duration,film_date,languages,num_speaker,published_date,views,comments_per_view,views_per_comment,num_ratings,funny_ratings,funny_rate
0,4553,1164,1140825600,60,1,1151367060,47227110,9.6e-05,10372.745443,93850,19645,0.209323
1,265,977,1140825600,43,1,1151367060,3200520,8.3e-05,12077.433962,2936,544,0.185286
2,124,1286,1140739200,26,1,1151367060,1636292,7.6e-05,13195.903226,2824,964,0.34136
3,200,1116,1140912000,35,1,1151367060,1697550,0.000118,8487.75,3728,59,0.015826
4,593,1190,1140566400,48,1,1151440680,12005869,4.9e-05,20245.984823,25620,1390,0.054254


In [16]:
df = pd.concat([df, sex], axis=1)
df.head()

Unnamed: 0,comments,duration,film_date,languages,num_speaker,published_date,views,comments_per_view,views_per_comment,num_ratings,funny_ratings,funny_rate,male
0,4553,1164,1140825600,60,1,1151367060,47227110,9.6e-05,10372.745443,93850,19645,0.209323,1
1,265,977,1140825600,43,1,1151367060,3200520,8.3e-05,12077.433962,2936,544,0.185286,1
2,124,1286,1140739200,26,1,1151367060,1636292,7.6e-05,13195.903226,2824,964,0.34136,1
3,200,1116,1140912000,35,1,1151367060,1697550,0.000118,8487.75,3728,59,0.015826,0
4,593,1190,1140566400,48,1,1151440680,12005869,4.9e-05,20245.984823,25620,1390,0.054254,1


In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop("male", axis=1),
                                                    df['male'], test_size=0.30,
                                                    random_state=101)

In [18]:
from sklearn.linear_model import LogisticRegression

# create an instance and fit the model
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [19]:
# predictions
Predictions = logmodel.predict(X_test)

In [20]:
#Model Evlautionat
from sklearn.metrics import classification_report
print(classification_report(y_test, Predictions))

              precision    recall  f1-score   support

           0       0.33      0.00      0.01       337
           1       0.56      1.00      0.72       428

   micro avg       0.56      0.56      0.56       765
   macro avg       0.45      0.50      0.36       765
weighted avg       0.46      0.56      0.40       765



## Using "Female" for binary classification

In [33]:
df1 = pd.read_csv("ted_tube.csv")
df_gen = pd.read_csv("gender.csv")
df = df1.drop(['Unnamed: 0', 'description', 'main_speaker',
              'name', 'title', 'url', 'ratings_list', 
              'tags', 'related_talks', 'ratings', 'speaker_occupation',
               'film_date_time', 'event'], axis=1)
df = pd.concat([df, df_gen], axis=1)
df = df.drop(['Unnamed: 0'], axis=1)
df.head(2)

Unnamed: 0,comments,duration,film_date,languages,num_speaker,published_date,views,comments_per_view,views_per_comment,num_ratings,funny_ratings,funny_rate,speaker_gender
0,4553,1164,1140825600,60,1,1151367060,47227110,9.6e-05,10372.745443,93850,19645,0.209323,male
1,265,977,1140825600,43,1,1151367060,3200520,8.3e-05,12077.433962,2936,544,0.185286,male


In [34]:
# converts categorical data
sex = pd.get_dummies(df['speaker_gender'], drop_first=True)
sex.head()

# drop other classes to make it simple
sex.drop(['male', 'unknown', 'mostly_male', 'mostly_female'], axis=1, inplace=True)
sex.head()

# drop soeaker (unneeded and non-categorical)
df.drop(['speaker_gender'], axis=1, inplace=True)
df.head()

Unnamed: 0,comments,duration,film_date,languages,num_speaker,published_date,views,comments_per_view,views_per_comment,num_ratings,funny_ratings,funny_rate
0,4553,1164,1140825600,60,1,1151367060,47227110,9.6e-05,10372.745443,93850,19645,0.209323
1,265,977,1140825600,43,1,1151367060,3200520,8.3e-05,12077.433962,2936,544,0.185286
2,124,1286,1140739200,26,1,1151367060,1636292,7.6e-05,13195.903226,2824,964,0.34136
3,200,1116,1140912000,35,1,1151367060,1697550,0.000118,8487.75,3728,59,0.015826
4,593,1190,1140566400,48,1,1151440680,12005869,4.9e-05,20245.984823,25620,1390,0.054254


In [36]:
# appends female group back to data set
df = pd.concat([df, sex], axis=1)
df.head()

Unnamed: 0,comments,duration,film_date,languages,num_speaker,published_date,views,comments_per_view,views_per_comment,num_ratings,funny_ratings,funny_rate,female
0,4553,1164,1140825600,60,1,1151367060,47227110,9.6e-05,10372.745443,93850,19645,0.209323,0
1,265,977,1140825600,43,1,1151367060,3200520,8.3e-05,12077.433962,2936,544,0.185286,0
2,124,1286,1140739200,26,1,1151367060,1636292,7.6e-05,13195.903226,2824,964,0.34136,0
3,200,1116,1140912000,35,1,1151367060,1697550,0.000118,8487.75,3728,59,0.015826,0
4,593,1190,1140566400,48,1,1151440680,12005869,4.9e-05,20245.984823,25620,1390,0.054254,0


In [37]:
# splitting up training set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop("female", axis=1),
                                                    df['female'], test_size=0.30,
                                                    random_state=101)

In [38]:
from sklearn.linear_model import LogisticRegression

# create an instance and fit the model
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [39]:
# predictions
Predictions = logmodel.predict(X_test)

In [40]:
#Model Evlautionat
from sklearn.metrics import classification_report
print(classification_report(y_test, Predictions))

              precision    recall  f1-score   support

           0       0.75      1.00      0.86       575
           1       0.00      0.00      0.00       190

   micro avg       0.75      0.75      0.75       765
   macro avg       0.38      0.50      0.43       765
weighted avg       0.56      0.75      0.65       765



  'precision', 'predicted', average, warn_for)
