In [None]:
#!pip install convokit
#!pip install nltk
#!pip install tqdm
from convokit import Corpus, download
import pandas as pd
import nltk 
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
import os
#nltk.download('punkt')

## Preprocessing

In [None]:
corpus = Corpus(filename=download("iq2-corpus"))
corpus.print_summary_stats()

Downloading iq2-corpus to /root/.convokit/downloads/iq2-corpus
Downloading iq2-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/iq2-corpus/iq2_corpus.zip (8.7MB)... Done
Number of Speakers: 471
Number of Utterances: 26562
Number of Conversations: 108


In [None]:
debate_df = corpus.get_conversations_dataframe()
# Cast indices to integer
debate_df.index = debate_df.index.astype('int64')

debate_df.head()

Unnamed: 0_level_0,vectors,meta.summary,meta.title,meta.date,meta.url,meta.results,meta.originalid,meta.winner
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,[],Debate description coming soon.,We Should Accept Performance-Enhancing Drugs i...,"Tuesday, January 15, 2008",http://intelligencesquaredus.org/debates/past-...,"{'breakdown': None, 'post': {'undecided': 4.0,...",PerformanceEnhancingDrugs-011508,for
356,[],How do we fix the economy? The U.S. government...,The Rich Are Taxed Enough,"Wednesday, October 24, 2012",http://intelligencesquaredus.org/debates/past-...,"{'breakdown': {'against_for': 5.0, 'undecided_...",102412%20taxes,against
671,[],Debate description coming soon.,Freedom of Expression Must Include the License...,"Tuesday, October 16, 2006",http://intelligencesquaredus.org/debates/past-...,"{'breakdown': None, 'post': {'undecided': 1.0,...",FreedomOfExpression-101806,tie
897,[],Debate description coming soon.,It's Time to End Affirmative Action,"Tuesday, November 13, 2007",http://intelligencesquaredus.org/debates/past-...,"{'breakdown': None, 'post': {'undecided': 6.0,...",Affirmative-Action-111307,against
1177,[],"Smart technology grants us unprecedented, imme...",Smart Technology Is Making Us Dumb,"Wednesday, May 13, 2015",http://intelligencesquaredus.org/debates/past-...,"{'breakdown': {'against_for': 8.0, 'undecided_...",051315%20Smart%20Tech,tie


In [None]:
transcript_df = corpus.get_utterances_dataframe()
transcript_df.head()

Unnamed: 0_level_0,timestamp,text,speaker,reply_to,conversation_id,meta.nontext,meta.segment,meta.paragraphbreaks,meta.speakertype
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,,… And now I’d like to introduce Robert Rosenkr...,Bob Costas,,0,"{'applause': [[0, 29]]}",0,[],mod
1,,"Well thank you very much. And, uh, uh, on beha...",Robert Rosenkranz,0.0,0,"{'applause': [[0, 445]]}",0,[],host
2,,"Thank you again, Bob. So this is the sixth deb...",Bob Costas,1.0,0,"{'pause': [[0, 237]], 'applause': [[0, 396]]}",0,[],mod
3,,Thank you. Everyone in this room uses performa...,Norman Fost,2.0,0,"{'audiencelaughter': [[0, 500], [0, 658]], 'al...",0,"[4071, 6809]",for
4,,"That is deep-, deeply, deeply Freudian.",Bob Costas,3.0,0,"{'audiencelaughter': [[0, 6]]}",0,[],mod


In [None]:
# Remove unused columns
del transcript_df['timestamp'], transcript_df['speaker'], transcript_df['reply_to'], \
    transcript_df['meta.nontext'], transcript_df['meta.segment'], transcript_df['meta.paragraphbreaks']
# Remove non-debaters (i.e. moderators) and change representation to true/false
transcript_df = transcript_df[transcript_df['meta.speakertype'] != 'mod'][transcript_df['meta.speakertype'] != 'host']
transcript_df['speaker_is_pro'] = transcript_df.apply(lambda r: r['meta.speakertype'] == 'for', axis=1)
del transcript_df['meta.speakertype']

# Cast all columns to appropriate types
transcript_df['conversation_id'] = transcript_df['conversation_id'].astype('int64')
transcript_df['speaker_is_pro'] = transcript_df['speaker_is_pro'].astype('bool')
transcript_df.index = transcript_df.index.astype('int64')

transcript_df.head()

  after removing the cwd from sys.path.


Unnamed: 0_level_0,text,conversation_id,speaker_is_pro
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,Thank you. Everyone in this room uses performa...,0,True
5,…eh…Alzado is constantly rolled out as a poste...,0,True
7,Everyone agrees these drugs should be banned f...,0,True
9,Thank you. This evening’s debate deals with on...,0,False
11,…that the athletes set out to accomplish. It t...,0,False


In [None]:
# Remove tied debates from dataframe
old_size = transcript_df.shape[0]
no_tie_filter = debate_df.loc[transcript_df['conversation_id'], 'meta.winner'] != 'tie'
no_tie_filter.index = transcript_df.index
transcript_df = transcript_df[no_tie_filter]
print("Reduced size from {0} examples to {1} examples.".format(
    old_size,
    transcript_df.shape[0]
))

Reduced size from 16045 examples to 15677 examples.


In [None]:
transcript_df['pro_won'] = transcript_df.apply(
    lambda r : debate_df.loc[r['conversation_id'], 'meta.winner'] == 'for',
    axis = 1
)
transcript_df.head()

Unnamed: 0_level_0,text,conversation_id,speaker_is_pro,pro_won
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,Thank you. Everyone in this room uses performa...,0,True,True
5,…eh…Alzado is constantly rolled out as a poste...,0,True,True
7,Everyone agrees these drugs should be banned f...,0,True,True
9,Thank you. This evening’s debate deals with on...,0,False,True
11,…that the athletes set out to accomplish. It t...,0,False,True


In [None]:
min_tokens_in_sentence = 10
sentence_df = pd.DataFrame()
counter = 0
total_count = 0
for index, row in transcript_df.iterrows():
  for tokenized_sentence in sent_tokenize(row['text']):
    if (len(word_tokenize(tokenized_sentence)) >= min_tokens_in_sentence):
      sentence_df = sentence_df.append({
      'sentence': tokenized_sentence, 
      'conversation_id' : row['conversation_id'], 
      'speaker_is_pro' : row['speaker_is_pro'],
      'pro_won' : row['pro_won']
      }, ignore_index = True)
  counter += 1
  if (counter == 1000): 
    total_count += 1
    counter = 0
    print("{0} entries converted.".format(total_count * 1000))
print("All {0} entries converted. {1} sentences in dataset.".format(transcript_df.shape[0],
                                                                    sentence_df.shape[0]))

1000 entries converted.
2000 entries converted.
3000 entries converted.
4000 entries converted.
5000 entries converted.
6000 entries converted.
7000 entries converted.
8000 entries converted.
9000 entries converted.
10000 entries converted.
11000 entries converted.
12000 entries converted.
13000 entries converted.
14000 entries converted.
15000 entries converted.
All 15677 entries converted. 60121 sentences in dataset.


In [None]:
# Cast each column to appropriate type
sentence_df['conversation_id'] = sentence_df['conversation_id'].astype('int64')
sentence_df['pro_won'] = sentence_df['pro_won'].astype('bool')
sentence_df['speaker_is_pro'] = sentence_df['speaker_is_pro'].astype('bool')

In [None]:
FILE_PATH = "./drive/MyDrive/RSA!/"
sentence_df.to_csv(FILE_PATH + 'iq2-sents.csv')
print("{0} sentences to be classified.".format(sentence_df.shape[0]))

data folder exists.
60121 sentences to be classified.


In [None]:
sentence_df.head()

Unnamed: 0,conversation_id,pro_won,sentence,speaker_is_pro
0,0,True,Everyone in this room uses performance-enhanci...,True
1,0,True,We use cars and computers to make our work mor...,True
2,0,True,"We use caffeine, alcohol, and Viagra to improv...",True
3,0,True,We send our children to fancy schools and Suzu...,True
4,0,True,And every athlete in recorded history has used...,True


## Feature Extraction

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import numpy as np
import re
#nltk.download('vader_lexicon')

<h4> Sentiment Analysis </h4>

In [None]:
features = pd.read_csv(FILE_PATH + 'iq2-sents.csv')
s = SentimentIntensityAnalyzer()
all_senti_scores = { row['sentence'] : s.polarity_scores(row['sentence']) for index, row in features.iterrows()}

In [None]:
features['senti_score'] = features['sentence'].apply(lambda x : all_senti_scores[x]['compound'])
features['senti_class_neg'] = features['sentence'].apply(
    lambda x : all_senti_scores[x]['neg'] > all_senti_scores[x]['pos'] and
                all_senti_scores[x]['neg'] > all_senti_scores[x]['neu']
)
features['senti_class_pos'] = features['sentence'].apply(
    lambda x : all_senti_scores[x]['pos'] > all_senti_scores[x]['neg'] and
                all_senti_scores[x]['pos'] > all_senti_scores[x]['neu']
)
features['senti_class_neu'] = features.apply(
    lambda x : (not x['senti_class_pos']) and (not x['senti_class_neg']), 
    axis = 1
)
features.head()

Unnamed: 0.1,Unnamed: 0,conversation_id,pro_won,sentence,speaker_is_pro,senti_score,senti_class_neg,senti_class_pos,senti_class_neu
0,0,0,True,Everyone in this room uses performance-enhanci...,True,0.0,False,False,True
1,1,0,True,We use cars and computers to make our work mor...,True,0.4754,False,False,True
2,2,0,True,"We use caffeine, alcohol, and Viagra to improv...",True,0.4404,False,False,True
3,3,0,True,We send our children to fancy schools and Suzu...,True,0.6369,False,False,True
4,4,0,True,And every athlete in recorded history has used...,True,0.0,False,False,True


<h4> Regex Tonal Features </h4>

In [None]:
regex_features = {
    'confusion1': r"(ˆ| )i (\S + ){,2}(not|n’t|never) (understand|know)",
    'confusion2': r"(not|n’t) make sense",
    'confusion3': r"(ˆ| )i (\S + ){,2}(curious|confused)",
    'confusion4' : r"(ˆ| )i (\S + ){,2}wonder",
    'confusion5' : r"(me|myself) wonder",
    'why_how' : r"(ˆ| )(why|how).*\?",
    'question_other' : r"[?]",
    'prediction': r"(am$|$’m$|$are$|$’re$|$is$|$’s) (not )?(going to$|$gonna)",
    'hypothetical' : r"(ˆ|, )if|unless",
    'citation': r"(said|reported|mentioned|declared|claimed|admitted|explained|insisted|promised|suggested|recommended|denied|blamed|apologized|agreed|answered|argued|complained|confirmed|proposed|replied|stated|told|warned|revealed|according to|) that [ˆ.,!?]",
    'comparison': "(than|compared to)",
    'examples': r"(ˆ| )(for example|for instance|such as|e\.g\.)( |$)",
    'definition' : r"(define|definition)",
    'personal_story' : r"(think|believe|see|know|feel|say|understand|mean|sure|agree|argue|consider|guess|realize|hope|support|aware|disagree|post|mention|admit|accept|assume|convince|wish|appreciate|speak|suppose|doubt|explain|wonder|discuss|view|suggest|recognize|respond|acknowledge|clarify|state|sorry|advocate|propose|define|apologize|curious|figure|claim|concede|debate|list|oppose|describe|suspect|reply|bet|realise|defend|convinced|offend|concern|intend|certain|conclude|reject|challenge|thank|con-done|value|skeptical|contend|anticipate|maintain|justify|recommend|confident|promise|guarantee|comment|unsure|elaborate|posit|swear|dispute|imply|misunderstand)",
    'you' : r"(you|your|yours)", 
    'we' : r"(ˆ| )we |(?<!the) (us|our|ours)( |$)"
}
non_binary_regex = {
    'hedge_pos' : r"(allegedly|apparently|appear to|conceivably|could be|doubtful|fairly|hopefully|i assume|i believe|i do not believe|i doubt|i feel|i do not feel|i guess|ispeculate|i think|i do not think|if anything|imo|imply|in my mind|in my opinion|in myunderstanding|in my view|it be possible|it look like|it do not look like|kind of|mainly|may|maybe|might|my impression be|my thinking be|my understanding be|perhaps|possibly|potentially|presumably|probably|quite|rather|relatively|seem|somehow|somewhat|sort of|supposedly|to my knowledge|virtually|would)",
    'hedge_neg' : r"(be definite|definitely|directly|enormously|entirely|evidently|exactly|explicitly|extremely|fundamentally|greatly|highly|in fact|incredibly|indeed|inevitably|intrinsically|invariably|literally|necessarily|no way|be obvious|obviously|perfectly|precisely|really|be self-evident|be sure|surely|totally|truly|be unambiguous|unambiguously|be undeniable|undeniably|undoubtedly|beunquestionable|unquestionably|very|wholly)",
    'qualification_pos' : r"(a bit|a few|a large amount of|a little|a lot of|a number of|almost|approximately|except|generally|if|in general|largely|likely|lots of|majority of|many|more or less|most|mostly|much|nearly|normally|occasionally|often|overall|partly|plentyof|rarely|roughly|several|some|sometimes|tend|ton of|tons of|typically|unless|unlikely|usually)",
    'qualification_neg' : r"(all|always|every|everybody|everyone|everything|never|no|no one|nobody|none|neither|not any|ever|forever)"
}

In [None]:
for feat_exp in regex_features:
  features[feat_exp] = features['sentence'].apply(lambda x : len(re.findall(regex_features[feat_exp], x)) > 0)
  print("Feature {0} extracted.".format(feat_exp))

Feature confusion1 extracted.
Feature confusion2 extracted.
Feature confusion3 extracted.
Feature confusion4 extracted.
Feature confusion5 extracted.
Feature why_how extracted.
Feature question_other extracted.
Feature prediction extracted.
Feature hypothetical extracted.
Feature citation extracted.
Feature comparison extracted.
Feature examples extracted.
Feature definition extracted.
Feature personal_story extracted.
Feature you extracted.
Feature we extracted.


In [None]:
for feat_exp in non_binary_regex:
  features[feat_exp] = features['sentence'].apply(lambda x : len(re.findall(non_binary_regex[feat_exp], x)))
  print("Feature {0} extracted.".format(feat_exp))

Feature hedge_pos extracted.
Feature hedge_neg extracted.
Feature qualification_pos extracted.
Feature qualification_neg extracted.


In [None]:
features['hedge'] = features[['hedge_pos', 'hedge_neg']].apply(
    lambda x : x['hedge_pos'] - x['hedge_neg'],
    axis = 1
)
features['qualification'] = features[['qualification_pos', 'qualification_neg']].apply(
    lambda x : x['qualification_pos'] - x['qualification_neg'],
    axis = 1
)
del features['hedge_pos']
del features['hedge_neg']
del features['qualification_pos']
del features['qualification_neg']
features[['hedge', 'qualification']]

Unnamed: 0,hedge,qualification
0,-1,-1
1,0,0
2,0,0
3,0,0
4,-1,-1
...,...,...
60116,0,-2
60117,0,-1
60118,0,-1
60119,0,1


In [None]:
features['confusion'] = features.apply(
    lambda row : row['confusion1'] or row['confusion2'] or row['confusion3'] or row['confusion4'] or row['confusion5'], 
    axis=1
)
del features['confusion1']
del features['confusion2']
del features['confusion3']
del features['confusion4']
del features['confusion5']

<h4> Lexicon-Based Features </h4>

In [None]:
features['tokens'] = features['sentence'].apply(lambda x: word_tokenize(x))

In [None]:
AD_df = pd.read_csv(FILE_PATH + 'arousal_dominance_lexicon.csv").loc[:, ['Word', 'A.Mean.Sum', 'D.Mean.Sum']]
AD_df.index = AD_df['Word']
del AD_df['Word']
AD_df

Unnamed: 0_level_0,A.Mean.Sum,D.Mean.Sum
Word,Unnamed: 1_level_1,Unnamed: 2_level_1
aardvark,2.41,4.27
abalone,2.65,4.95
abandon,3.73,3.32
abandonment,4.95,2.64
abbey,2.20,5.00
...,...,...
zone,3.78,5.23
zoning,3.77,4.47
zoo,5.63,6.33
zoom,5.68,5.90


In [None]:
arousal_df = AD_df.loc[:, 'A.Mean.Sum']
dominance_df = AD_df.loc[:, 'D.Mean.Sum']

In [None]:
concrete_df = pd.read_csv(FILE_PATH + 'concreteness_lexicon.csv').loc[:, ['Word', 'Conc.M']]
concrete_df.index = concrete_df['Word']
del concrete_df['Word']
concrete_df

Unnamed: 0_level_0,Conc.M
Word,Unnamed: 1_level_1
a,1.46
aardvark,4.68
aback,1.65
abacus,4.52
abandon,2.54
...,...
zebra crossing,4.56
zero tolerance,2.21
ZIP code,3.77
zoom in,3.57


In [None]:
subjectivity_dict = {}
with open(FILE_PATH + 'subjectivity_lexicon') as file:
  for i, line in enumerate(file.readlines()):
    word = re.findall('word1=(.*?) pos1=', line)[0]
    polarity = -1 if re.findall('priorpolarity=(.*?)\n', line)[0] == 'negative' else 1
    strength = 0.5 if re.findall('type=(.*?) len=', line)[0] == 'weaksubj' else 1
    subjectivity_dict[word] = polarity * strength
subjective_df = pd.DataFrame(subjectivity_dict.values(), index = subjectivity_dict.keys())
subjective_df['Subjectivity'] = subjective_df[0]
del subjective_df[0]
subjective_df

Unnamed: 0,Subjectivity
abandoned,-0.5
abandonment,-0.5
abandon,-0.5
abase,-1.0
abasement,-1.0
...,...
zealot,-1.0
zealous,-1.0
zealously,-1.0
zenith,1.0


In [None]:
def average_df_value_on_tokenized_sentence(tokens, df):
  count = 0
  total_score = 0.0
  for word in tokens:
    try:
      total_score += df[word]
      count += 1
    except KeyError:
      pass
  return total_score/count if count != 0 else 0.0

for feat, df in zip(["arousal", "dominance", "concreteness", "subjectivity"], [arousal_df, dominance_df, concrete_df, subjective_df]):
  features[feat] = features['tokens'].apply(lambda x : average_df_value_on_tokenized_sentence(x, df))
  print("Feature {0} extracted.".format(feat))

Feature arousal extracted.
Feature dominance extracted.
Feature concreteness extracted.
Feature subjectivity extracted.


<h4> Kialo Knowledge Base Features </h4>

In [None]:
kialo_df = pd.read_csv(FILE_PATH + "kialo.csv")
kialo_df['tokens'] = kialo_df['text'].apply(lambda x : word_tokenize(x))
kialo_df.head()

Unnamed: 0,did,cid,author,parent,relation,deleted,rel-pro,rel-neu,rel-con,text,tokens
0,11371,11371.0,f09a6d28-8109-4b76-ba87-41e639b5d662,,,0,0,1,0,,[nan]
1,11371,11371.1,f09a6d28-8109-4b76-ba87-41e639b5d662,11371.0,0.0,0,7,0,19,India should return to a paper ballot.,"[India, should, return, to, a, paper, ballot, .]"
2,11371,11371.2,f09a6d28-8109-4b76-ba87-41e639b5d662,11371.1,1.0,1,0,0,0,Yes. It should be done,"[Yes, ., It, should, be, done]"
3,11371,11371.3,f09a6d28-8109-4b76-ba87-41e639b5d662,11371.1,1.0,1,0,0,0,Congress is right,"[Congress, is, right]"
4,11371,11371.4,f09a6d28-8109-4b76-ba87-41e639b5d662,11371.1,1.0,1,0,0,0,We need a review on the issue,"[We, need, a, review, on, the, issue]"


In [None]:
kialo_df['unique_tokens'] = kialo_df['tokens'].apply(lambda x : set(x))
features['matched_kialo_statements'] = features['tokens'].apply(
    lambda s : list(abridged_kialo[abridged_kialo['unique_tokens'].apply(
        lambda r : len(set(s).intersection(r)) >= 5
    )].index)
)
print("{0} matches found in knowledge base.".format(
    features['matched_kialo_statements'].map(
        lambda x : len(x)
      ).sum()
    )
)

914 matches found in knowledge base.


In [None]:
kialo_df['rel-pro'] = kialo_df['rel-pro'].astype('int64')
kialo_df['rel-neu'] = kialo_df['rel-neu'].astype('int64')
kialo_df['rel-con'] = kialo_df['rel-con'].astype('int64')

In [None]:
# Frequency
features['kialo_frequency'] = features['matched_kialo_statements'].apply(
    lambda x : np.log(len(x) + 1)/np.log(2)
)

In [None]:
# Attractiveness
features['kialo_avg_num_responses'] = features['matched_kialo_statements'].apply(
    lambda x : kialo_df.loc[x].apply(
        lambda c : c['rel-neu'] + c['rel-pro'] + c['rel-con'],
        axis = 1
    ).sum()/len(x) if len(x) > 0 else 0.0
)
features['kialo_attractiveness'] = features['kialo_avg_num_responses'].apply(
    lambda x : np.log(x + 1) / np.log(2)
)

In [None]:
# Extremeness
def safe_div(a, b):
  return 0 if b == 0 else a * 1.0 / b

kialo_df['pro_proportion'] = kialo_df.apply(
    lambda x : safe_div(x['rel-pro'], x['rel-pro'] + x['rel-neu'] + x['rel-con']), 
    axis = 1
)
kialo_df['neg_proportion'] = kialo_df.apply(
    lambda x : safe_div(x['rel-con'], x['rel-pro'] + x['rel-neu'] + x['rel-con']),
    axis = 1
)
features['kialo_extremeness'] = features['matched_kialo_statements'].apply(
    lambda x : kialo_df.loc[x].apply(
        lambda c : abs(c['pro_proportion'] - c['neg_proportion']), 
        axis = 1
    ).sum()/len(x) if len(x) > 0 else 0.0
)

In [None]:
del features['sentence']
del features['tokens']
del features['matched_kialo_statements']
features.head()

Unnamed: 0,conversation_id,pro_won,speaker_is_pro,senti_score,senti_class_neg,senti_class_pos,senti_class_neu,why_how,question_other,prediction,hypothetical,citation,comparison,examples,definition,personal_story,you,we,confusion,arousal,dominance,concreteness,subjectivity,hedge,qualification,kialo_frequency,kialo_avg_num_responses,kialo_attractiveness,kialo_extremeness
0,0,True,True,0.0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,4.5,5.076667,0.0,0.0,-1,-1,0.0,1.0,1.0,0.0
1,0,True,True,0.4754,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,4.0125,6.0025,0.0,0.0,0,0,0.0,0.0,0.0,0.0
2,0,True,True,0.4404,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,5.162,5.534,0.0,0.0,0,0,0.0,0.0,0.0,0.0
3,0,True,True,0.6369,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,4.7925,6.265,0.0,0.0,0,0,0.0,0.0,0.0,0.0
4,0,True,True,0.0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,4.636667,5.026667,0.0,0.0,-1,-1,0.0,0.0,0.0,0.0


<h1> Predicting IQ2 Debate Results <h1>

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from itertools import product

In [None]:
aggregate_df = pd.DataFrame()
for i in sorted(set(features['conversation_id'])):
  conv_speeches = features[features['conversation_id'] == i]
  pro_details = conv_speeches[conv_speeches['speaker_is_pro']]
  con_details = conv_speeches[~conv_speeches['speaker_is_pro']]
  debate_details = {
      'conversation_id' : i, 
      'pro_won' : conv_speeches['pro_won'].all(),
  }
  for side, performance in zip(["+", "-"], [pro_details, con_details]):
    for col in features.columns:
      if (col != 'conversation_id' and col != 'pro_won' and col != 'speaker_is_pro'):
        debate_details[side + col] = performance[col].sum()
  aggregate_df = aggregate_df.append(debate_details, ignore_index = True)
print("There are {0} rows in this DataFrame, the same as the number of unique debates".format(aggregate_df.shape[0]))
aggregate_df.head()

There are 105 rows in this DataFrame, the same as the number of unique debates


Unnamed: 0,+arousal,+citation,+comparison,+concreteness,+confusion,+definition,+dominance,+examples,+hedge,+hypothetical,+kialo_attractiveness,+kialo_avg_num_responses,+kialo_extremeness,+kialo_frequency,+personal_story,+prediction,+qualification,+question_other,+senti_class_neg,+senti_class_neu,+senti_class_pos,+senti_score,+subjectivity,+we,+why_how,+you,-arousal,-citation,-comparison,-concreteness,-confusion,-definition,-dominance,-examples,-hedge,-hypothetical,-kialo_attractiveness,-kialo_avg_num_responses,-kialo_extremeness,-kialo_frequency,-personal_story,-prediction,-qualification,-question_other,-senti_class_neg,-senti_class_neu,-senti_class_pos,-senti_score,-subjectivity,-we,-why_how,-you,conversation_id,pro_won
0,1574.994766,0.0,22.0,0.0,0.0,3.0,2153.194522,2.0,29.0,17.0,4.584963,6.0,2.0,9.0,191.0,0.0,-170.0,21.0,4.0,384.0,0.0,24.0403,0.0,80.0,4.0,72.0,1736.443604,0.0,13.0,0.0,0.0,1.0,2405.731255,0.0,34.0,22.0,5.584963,7.0,2.333333,10.0,204.0,0.0,-185.0,32.0,7.0,438.0,3.0,38.1054,0.0,66.0,3.0,133.0,0.0,1.0
1,1115.527371,0.0,11.0,0.0,0.0,1.0,1500.49332,0.0,-7.0,13.0,1.0,1.0,1.0,2.0,90.0,0.0,-118.0,13.0,1.0,275.0,1.0,36.8296,0.0,55.0,4.0,79.0,1325.964678,0.0,22.0,0.0,0.0,0.0,1871.477186,0.0,-21.0,16.0,2.0,3.0,0.333333,4.0,156.0,0.0,-92.0,36.0,2.0,329.0,6.0,60.2615,0.0,96.0,2.0,99.0,356.0,0.0
2,1098.6494,0.0,12.0,0.0,0.0,0.0,1581.439416,0.0,3.0,12.0,0.0,0.0,0.0,2.0,150.0,0.0,-78.0,3.0,0.0,278.0,1.0,30.8593,0.0,68.0,2.0,49.0,1382.932748,0.0,29.0,0.0,0.0,2.0,2007.6881,0.0,32.0,18.0,3.321928,5.0,1.5,7.0,222.0,0.0,-176.0,13.0,1.0,352.0,1.0,59.0412,0.0,72.0,2.0,94.0,897.0,0.0
3,1162.115615,0.0,14.0,0.0,0.0,0.0,1554.295147,0.0,-10.0,11.0,2.0,3.0,0.333333,3.0,136.0,0.0,-87.0,11.0,4.0,286.0,2.0,-12.5153,0.0,95.0,1.0,59.0,1055.260401,0.0,15.0,0.0,0.0,5.0,1431.390759,2.0,4.0,6.0,2.807355,6.0,1.0,2.0,135.0,0.0,-146.0,12.0,2.0,262.0,0.0,0.8334,0.0,84.0,1.0,48.0,1406.0,0.0
4,1107.684393,0.0,7.0,0.0,0.0,1.0,1490.764758,0.0,-46.0,6.0,0.0,0.0,0.0,3.0,108.0,0.0,-178.0,20.0,0.0,269.0,0.0,26.8183,0.0,71.0,4.0,61.0,1423.924808,0.0,23.0,0.0,0.0,6.0,1926.964914,2.0,35.0,6.0,3.0,4.0,1.333333,6.0,139.0,0.0,-103.0,36.0,1.0,348.0,2.0,36.5066,0.0,54.0,10.0,97.0,1595.0,1.0


In [None]:
X_df = aggregate_df.loc[:, aggregate_df.columns != 'pro_won']
X_df = X_df.loc[:, X_df.columns != 'conversation_id']
y_df = aggregate_df['pro_won']
X = X_df.to_numpy()
y = y_df.to_numpy()

In [None]:
training_examples = int(0.8 * X.shape[0])
X_train = X[:training_examples]
y_train = y[:training_examples]
X_test = X[training_examples:]
y_test = y[training_examples:]
print("{0} training examples each with {1} features. {2} testing examples.".format(X_train.shape[0], X_train.shape[1], X_test.shape[0]))

84 training examples each with 52 features. 21 testing examples.


In [None]:
reg_strengths = [1e-1, 1e-2, 1e-3, 1e-4]
penalties = ['l1', 'l2']
losses = ['hinge', 'log']
early_stopping_options = [True, False]

def train(X, y, alpha = 0.1, loss = 'log', penalty = 'l2', early_stopping=False):
  classifier = make_pipeline(
      StandardScaler(),
      SGDClassifier(loss=loss, penalty=penalty, alpha=alpha, early_stopping=early_stopping)
  )
  classifier.fit(X, y)
  return classifier

In [None]:
best_classifier = None
best_test_acc = 0.0
best_settings = {}
all_results = []
for alpha, penalty, loss, early_stopping in product(reg_strengths, penalties, losses, early_stopping_options):
  print("Regularization strength = {0}. Penalty = {1}. Loss = {2}. Early stopping {3}.".format(alpha, penalty, loss, "enabled" if early_stopping else "disabled"))
  classifier = train(X_train, y_train, alpha=alpha, penalty=penalty, loss=loss, early_stopping=early_stopping)
  train_acc = classifier.score(X_train, y_train)
  print("\tTraining accuracy = " + str(train_acc*100) + "%")
  test_acc = classifier.score(X_test, y_test)
  print("\tTesting accuracy = " + str(test_acc*100) + "%")

  all_results.append({
      'alpha' : alpha,
      'penalty' : penalty,
      'loss' : loss,
      'early_stopping' : early_stopping,
      'final_train_acc' : train_acc,
      'final_test_acc' : test_acc,
      'classifier' : classifier
  })

  if (best_test_acc < test_acc):
    best_classifier = classifier
    best_test_acc = test_acc
    best_settings['alpha'] = alpha
    best_settings['penalty'] = penalty
    best_settings['loss'] = loss
    best_settings['early_stopping'] = early_stopping
print(
    "\nThe best testing accuracy was {0}, acheived by training by SGD on {1} loss with {2} regularization and regularization strength set to {3} {4} early stopping.".format(
        best_test_acc, best_settings['loss'], best_settings['penalty'], best_settings['alpha'], 'with' if best_settings['early_stopping'] else 'without'
    )
)

Regularization strength = 0.1. Penalty = l1. Loss = hinge. Early stopping enabled.
	Training accuracy = 58.333333333333336%
	Testing accuracy = 57.14285714285714%
Regularization strength = 0.1. Penalty = l1. Loss = hinge. Early stopping disabled.
	Training accuracy = 67.85714285714286%
	Testing accuracy = 57.14285714285714%
Regularization strength = 0.1. Penalty = l1. Loss = log. Early stopping enabled.
	Training accuracy = 63.095238095238095%
	Testing accuracy = 47.61904761904761%
Regularization strength = 0.1. Penalty = l1. Loss = log. Early stopping disabled.
	Training accuracy = 61.904761904761905%
	Testing accuracy = 47.61904761904761%
Regularization strength = 0.1. Penalty = l2. Loss = hinge. Early stopping enabled.
	Training accuracy = 76.19047619047619%
	Testing accuracy = 71.42857142857143%
Regularization strength = 0.1. Penalty = l2. Loss = hinge. Early stopping disabled.
	Training accuracy = 78.57142857142857%
	Testing accuracy = 71.42857142857143%
Regularization strength = 