## Part A: Subreddit Prediction ##

In [None]:
# `files.download(...)` is used below to hand result files to the user.
# On Colab it comes from google.colab; elsewhere a local module named `files`
# must be importable instead.
try:
    from google.colab import files
except ImportError:
    # Only a missing Colab package should trigger the fallback; any other
    # error is a real problem and must not be swallowed.
    import files
    
import pandas as pd

In [None]:
# Local file names under which the Part A train / test splits are saved.
subreddit_train = "coursework_subreddit_train.json"
subreddit_test = "coursework_subreddit_test.json"

# Download both splits with curl (IPython shell escape); `$name` interpolates
# the Python variables above as the output paths.
!curl -o  $subreddit_train https://storage.googleapis.com/tad2018/coursework_subreddit_train3.json
!curl -o  $subreddit_test https://storage.googleapis.com/tad2018/coursework_subreddit_test3.json

In [None]:
# One JSON object per line -> one row per training thread.
train_threads = pd.read_json(path_or_buf=subreddit_train, lines=True)
# Store the label as a categorical column.
train_threads['subreddit'] = pd.Categorical(train_threads['subreddit'])
# Note: .size is rows x columns (total number of cells), not the row count.
print(train_threads.size)
# info() prints its summary itself and returns None, so it must not be wrapped
# in print() (that only adds a stray "None" line).
train_threads.info()

In [None]:
# One JSON object per line -> one row per test thread.
test_threads = pd.read_json(path_or_buf=subreddit_test, lines=True)
# Store the label as a categorical column.
test_threads['subreddit'] = pd.Categorical(test_threads['subreddit'])
# Note: .size is rows x columns (total number of cells), not the row count.
print(test_threads.size)
# info() prints its summary itself and returns None, so it must not be wrapped
# in print() (that only adds a stray "None" line).
test_threads.info()

In [None]:
# Number of training threads per subreddit, most frequent first.
subreddit_counts = train_threads.subreddit.value_counts()
# The 20 best-represented subreddits: as counts, and as a plain list of names.
top_subbreddits = subreddit_counts.nlargest(20)
top_subbreddits_list = list(top_subbreddits.index)

In [None]:
# Gold subreddit labels used to fit and score the Part A classifiers.
train_labels, test_labels = train_threads.subreddit, test_threads.subreddit

### General purpose code

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


def eval_summary(predictions, labels, avg='macro'):
    """Score a set of predictions against the gold labels.

    Args:
        predictions: predicted class for every sample.
        labels: true class for every sample, in the same order.
        avg: averaging mode passed to the precision / recall / F1 scorers.

    Returns:
        A tuple ``(summary, report, matrix)`` where ``summary`` is the list
        ``[accuracy, precision, recall, f1]``, ``report`` is the text returned
        by ``classification_report`` and ``matrix`` is the confusion matrix
        (rows = true class, columns = predicted class).
    """
    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round swaps precision with recall, reports the support of the predicted
    # classes and transposes the confusion matrix, so name both arguments.
    precision = precision_score(y_true=labels, y_pred=predictions, average=avg)
    recall = recall_score(y_true=labels, y_pred=predictions, average=avg)
    # `beta` must be given by keyword: recent scikit-learn releases reject it
    # as a third positional argument.
    f1 = fbeta_score(y_true=labels, y_pred=predictions, beta=1, average=avg)
    accuracy = accuracy_score(y_true=labels, y_pred=predictions)

    summary = [accuracy, precision, recall, f1]
    report = classification_report(y_true=labels, y_pred=predictions)
    matrix = confusion_matrix(y_true=labels, y_pred=predictions)

    return (summary, report, matrix)

def dataframe_from_report(report, index_name=None):
    """Parse the text of a ``classification_report`` into a DataFrame.

    Only the per-class rows are kept. Summary rows ("accuracy", "macro avg",
    "weighted avg", "micro avg", "samples avg" and the older "avg / total")
    are skipped, and class names containing spaces are preserved.

    Args:
        report: the report text; its first non-blank line holds the column
            names (e.g. precision, recall, f1-score, support).
        index_name: optional name given to the index of the result.

    Returns:
        A DataFrame indexed by class name with one float column per metric.

    Raises:
        ValueError: if ``report`` contains no non-blank line.
    """
    lines = [line.strip() for line in report.split("\n") if line.strip() != '']
    if not lines:
        raise ValueError("classification report is empty")

    column_names = lines[0].split()
    n_cols = len(column_names)
    summary_rows = {'micro avg', 'macro avg', 'weighted avg', 'samples avg', 'avg / total'}

    rows = {}
    for line in lines[1:]:
        tokens = line.split()
        # A class row is "<label...> <one value per column>". Rows with fewer
        # values (such as the "accuracy" row) are not per-class rows.
        if len(tokens) <= n_cols:
            continue
        # Read the values from the right so that labels may contain spaces.
        label = " ".join(tokens[:-n_cols])
        if label in summary_rows:
            continue
        rows[label] = [float(token) for token in tokens[-n_cols:]]

    data = pd.DataFrame(list(rows.values()), index=list(rows.keys()), columns=column_names)
    data.index.name = index_name
    return data

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
import nltk
from nltk.corpus import stopwords



# Fetch NLTK's stop-word corpus; this has to happen before stopwords.words()
# on the next line can load it.
nltk.download('stopwords')
# English stop words, held as a set for the membership test in tokenize().
stop_words = set(stopwords.words('english'))

# Tokenizer whose tokens are the runs of \w characters in the text.
regexp_tokenizer = RegexpTokenizer(r'\w+')

def tokenize(string):
  """Split ``string`` into lower-cased word tokens.

  Tokens come from ``regexp_tokenizer``. A token is dropped when its
  lower-cased form is a stop word, or when it is not between 2 and 23
  characters long.
  """
  kept = []
  for raw in regexp_tokenizer.tokenize(string):
    lowered = raw.lower()
    if lowered in stop_words:
      continue
    # The length test is applied to the token as matched (before lower-casing).
    if not 1 < len(raw) < 24:
      continue
    kept.append(lowered)
  return kept

### Q1  
Use the text of the thread (title) and body of all comments in a thread to learn a classification model, based on the Scikit Learn package. Your labels are the subreddit that the thread came from. You should conduct experiments using two vectorizers:  

 - CountVectorizer
 - TfidfVectorizer
 
and two classifiers:  

 - LogisticRegression
 - SVCClassifier (i.e. SVM)
 
Evaluate the classifiers (and two `DummyClassifiers`, with `strategy="most_frequent"` and `strategy="stratified"`) using Accuracy, Precision, Recall & F1 measures (macro averages). Report your results for these values in a single table. Also, produce a confusion_matrix for the best classifier. Create a bar chart graph with the F1 score on the Y-axis and the subreddit name on the X-axis

In [None]:
def get_text_from_thread(thread):
    """Concatenate a thread's title and all its non-empty post bodies.

    ``thread`` must provide ``'title'`` and ``'posts'``; posts without a
    ``'body'`` key, or whose body is blank after stripping, are skipped.
    The pieces are joined with single spaces.
    """
    stripped_bodies = (post['body'].strip() for post in thread['posts'] if "body" in post)
    pieces = [thread['title']]
    pieces.extend(body for body in stripped_bodies if body != '')
    return " ".join(pieces)
  
# Add the flattened text (title + post bodies) of every thread to both splits.
for threads in (train_threads, test_threads):
    threads['full_thread'] = threads.apply(get_text_from_thread, axis=1)

# Sanity check: total number of thread texts across train and test.
full = pd.concat([train_threads['full_thread'], test_threads['full_thread']])
print(len(full))

In [None]:
# TF-IDF representation: fitted on the training threads, then applied to test.
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize)
train_tfidf_matrix = tfidf_vectorizer.fit_transform(train_threads['full_thread'].values)
test_tfidf_matrix = tfidf_vectorizer.transform(test_threads['full_thread'].values)

# Raw term-count representation, built the same way.
count_vectorizer = CountVectorizer(tokenizer=tokenize)
train_cv_matrix = count_vectorizer.fit_transform(train_threads['full_thread'].values)
test_cv_matrix = count_vectorizer.transform(test_threads['full_thread'].values)

In [None]:
import nltk

# Flatten every training thread into one stream of normalised tokens.
all_tokens = [token
              for text in train_threads.full_thread.values
              for token in tokenize(text)]
print(len(all_tokens), len(train_threads.full_thread.values))

# Frequency table over the training tokens; display the ten most common.
word_dist = nltk.FreqDist(all_tokens)
word_dist.most_common(10)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC


# --- Classifiers trained on the count-vectorizer features ---
# fit() returns the estimator itself, so construction and training are chained.
most_frequent_cv = DummyClassifier(strategy="most_frequent").fit(train_cv_matrix, train_labels)
stratified_cv = DummyClassifier(strategy="stratified").fit(train_cv_matrix, train_labels)
lr_cv = LogisticRegression().fit(train_cv_matrix, train_labels)
svc_cv = SVC().fit(train_cv_matrix, train_labels)

# --- The same four classifiers trained on the TF-IDF features ---
most_frequent_tfidf = DummyClassifier(strategy="most_frequent").fit(train_tfidf_matrix, train_labels)
stratified_tfidf = DummyClassifier(strategy="stratified").fit(train_tfidf_matrix, train_labels)
svc_tfidf = SVC().fit(train_tfidf_matrix, train_labels)
lr_tfidf = LogisticRegression().fit(train_tfidf_matrix, train_labels)

In [None]:
# Every (model, vectorizer) combination to score, in reporting order. Each
# entry pairs a fitted classifier with the test matrix built by the same
# vectorizer it was trained on.
evaluated_models = [
    ('Dummy Stratified', 'Count Vectorizer', stratified_cv, test_cv_matrix),
    ('Dummy Stratified', 'Tf-idf Vectorizer', stratified_tfidf, test_tfidf_matrix),
    ('Dummy Most_Frequent', 'Count Vectorizer', most_frequent_cv, test_cv_matrix),
    ('Dummy Most_Frequent', 'Tf-idf Vectorizer', most_frequent_tfidf, test_tfidf_matrix),
    ('LogisticRegression', 'Count Vectorizer', lr_cv, test_cv_matrix),
    ('LogisticRegression', 'Tf-idf Vectorizer', lr_tfidf, test_tfidf_matrix),
    ('SVM Classifier', 'Count Vectorizer', svc_cv, test_cv_matrix),
    ('SVM Classifier', 'Tf-idf Vectorizer', svc_tfidf, test_tfidf_matrix),
]

# One row per combination: [model, vectorizer, accuracy, precision, recall, f1].
summaries_list = []
for model_name, vectorizer_name, classifier, features in evaluated_models:
    prediction = classifier.predict(features)
    summary, _, _ = eval_summary(prediction, test_labels, avg='macro')
    summaries_list.append([model_name, vectorizer_name] + summary)

In [None]:
# Results table, best macro F1 first.
summaries = pd.DataFrame(
    summaries_list,
    columns=['Model Name', 'Vectorizer', 'Accuracy', 'Precision', 'Recall', 'F-1'],
).sort_values(by=["F-1"], ascending=False)

# Save the table and offer it for download, then show it in the notebook.
summaries.to_csv('summarized_performance.csv')
files.download('summarized_performance.csv')
summaries.head(10)

In [None]:
# Classifier used for the detailed report below. It is picked by hand, not
# derived from the `summaries` table, and it must be paired with the
# count-vectorizer test matrix it was trained for.
best = lr_cv

In [None]:
import matplotlib.pyplot as plt

# Per-class report and confusion matrix for the chosen classifier.
prediction = best.predict(test_cv_matrix)
summary, report, matrix = eval_summary(prediction, test_labels, avg='macro')
c_report = dataframe_from_report(report, 'subreddit')
class_names = c_report.index

c_matrix = pd.DataFrame(matrix, index=class_names, columns=class_names)
c_matrix.to_csv('confusion_matrix.csv')

print(c_report)
print(matrix)
c_report.to_csv('best_report.csv')

# Bar chart: one bar per subreddit, height = F1 score.
bar_positions = range(len(c_report))
plt.figure(figsize=(15,10))
plt.bar(bar_positions, c_report['f1-score'])
plt.xticks(bar_positions, class_names, rotation='vertical', fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel("subreddit", fontsize=20)
plt.ylabel("F1-score", fontsize=20)
plt.tight_layout()
plt.savefig('barchart.png')
files.download('barchart.png')
plt.show()

### Q2  

Parameter tuning. We have previously discussed some parameters such as sublinear_tf, n-gram sizes, and vocabulary sizes in the TfidfVectorizer. Tune the vectorizer & classifier parameters on the training data, namely (Hint: You can use SKlearn's `GridSearchCV`):

 - `TfidfVectorizer`: `sublinear_tf=True` or `False`; n-gram 1 to 3; vocabulary size $5000, 10000, 20000, ALL$
 - `LogisticRegression`: `C` (a regularisation parameter, values $C = 0.1, 1, 10, 100$)
 - `SVCClassifier`: `C` (a penalty parameter, values $C = 0.1, 1, 10, 100$)  
 
Report the best parameters found and the results of the model with those parameters on the test data.

### WARNING: this may take a long time to run

In [None]:
import warnings
warnings.filterwarnings('ignore') # I know there will be some warnings...

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# The 'classify' step is only a placeholder: the grid below substitutes each
# candidate classifier into it.
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=tokenize)),
    ('classify', SVC())
])

params = {
    'vectorizer__sublinear_tf': [True, False],
    'vectorizer__ngram_range': [(1,1), (1,2), (1,3)],
    # None keeps the whole vocabulary (the "ALL" setting of the task).
    'vectorizer__max_features': [5000, 10000, 20000, None],

    # Both classifiers required by the task. They share the `C` parameter, so
    # one grid covers both. Previously only LogisticRegression was listed, so
    # the SVC was never tuned.
    'classify': [LogisticRegression(), SVC()],
    'classify__C': [0.1, 1, 10, 100]
}

# 4-fold cross-validation on the training data, selecting by macro F1.
grid = GridSearchCV(pipeline, cv=4, n_jobs=-1, param_grid=params, verbose=1, scoring='f1_macro')

grid.fit(train_threads.full_thread.values, train_labels)

In [None]:
# Best cross-validated score, the parameters that achieved it, and the refitted
# pipeline, separated by blank lines.
print(grid.best_score_, grid.best_params_, grid.best_estimator_, sep="\n\n")

In [None]:
# Evaluate the tuned pipeline on the held-out test threads. The pipeline holds
# its own vectorizer, so it is given the raw text.
best_estimator = grid.best_estimator_

prediction = best_estimator.predict(test_threads.full_thread.values)
# eval_summary already returns the report and the confusion matrix; the earlier
# separate computation of both was overwritten immediately and has been removed.
summary, report, matrix = eval_summary(prediction, test_labels)

print(summary)
print(report)
print(matrix)

## Part B: Discourse prediction ##

### Load data ###

In [None]:
# Local file names under which the Part B train / test splits are saved.
discourse_train = "coursework_discourse_train.json"
discourse_test = "coursework_discourse_test.json"

# Download both splits with curl (IPython shell escape); `$name` interpolates
# the Python variables above as the output paths.
!curl -o  $discourse_train https://storage.googleapis.com/tad2018/coursework_discourse_train.json
!curl -o  $discourse_test https://storage.googleapis.com/tad2018/coursework_discourse_test.json

In [None]:
# The reddit thread structure is nested: each thread holds a list of posts.
# This block reads the file as JSON lines and flattens it into a data frame with one row per post.
import pandas as pd
import json

def load_posts(file):
  """Flatten a JSON-lines file of threads into one DataFrame row per post.

  Args:
    file: path of a file holding one JSON thread per line. Every thread must
      provide 'subreddit', 'title', 'url' and 'posts'; every post must provide
      'id'. Other post fields are optional.

  Returns:
    A DataFrame with the columns subreddit, title, url, id, author, body,
    majority_link, post_depth, discourse_type (taken from the post's
    'majority_type') and in_reply_to. Missing optional fields become "",
    except post_depth which becomes 0.
  """
  # 'author' used to be listed twice, which produced two columns with the same
  # name (so frame['author'] returned a DataFrame); it is now emitted once.
  labels = ['subreddit', 'title', 'url', 'id', 'author', 'body', 'majority_link',
            'post_depth', 'discourse_type', 'in_reply_to']

  # A temporary variable to store the list of post content.
  posts_tmp = list()

  # Read as UTF-8 explicitly instead of relying on the platform default.
  with open(file, encoding='utf-8') as jsonfile:
    for line in jsonfile:
      # Tolerate blank lines (e.g. a trailing newline at the end of the file).
      if not line.strip():
        continue
      thread = json.loads(line)
      for post in thread['posts']:
        posts_tmp.append((thread['subreddit'], thread['title'], thread['url'],
                          post['id'], post.get('author', ""), post.get('body', ""),
                          post.get("majority_link", ""), post.get('post_depth', 0),
                          post.get('majority_type', ""), post.get('in_reply_to', "")))

  # Create the posts data frame.
  return pd.DataFrame(posts_tmp, columns=labels)

In [None]:
train_posts = load_posts(discourse_train)
# Filter out empty labels. `.copy()` makes the result an independent frame, so
# assigning columns to it later does not trigger pandas' SettingWithCopyWarning.
train_posts = train_posts[train_posts['discourse_type'] != ""].copy()

print(train_posts[["body","discourse_type"]].head())
print("Num posts: ", len(train_posts))

The label for the post we will be predicting is in the discourse_type column.

In [None]:
test_posts = load_posts(discourse_test)
# Filter out empty labels. `.copy()` makes the result an independent frame, so
# assigning columns to it later does not trigger pandas' SettingWithCopyWarning.
test_posts = test_posts[test_posts['discourse_type'] != ""].copy()

print("Num posts: ", len(test_posts))

In [None]:
# Store the discourse label of each split as a categorical column ...
for posts in (train_posts, test_posts):
  posts['discourse_type'] = pd.Categorical(posts['discourse_type'])

# ... and keep those columns as the gold labels for Part B.
train_labels, test_labels = train_posts['discourse_type'], test_posts['discourse_type']

Examine the distribution over labels on the training data.

In [None]:
# Number of training posts per discourse type, with summary statistics.
discourse_counts = train_labels.value_counts()
print(discourse_counts.describe())

# Up to 200 most frequent discourse types: first with their counts, then as a
# plain list of names.
largest_counts = discourse_counts.nlargest(200)
print(largest_counts)
top_discourse = largest_counts.index.tolist()
print(top_discourse)

### Q3

Build and evaluate a text classification model for comment discourse prediction. You should use the best vector representation (CountVectorizer/TfIdfVectorizer) identified in Q1 above, as input to a LogisticRegression classifier 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# Raw post bodies of each split.
train_posts_, test_posts_ = train_posts.body, test_posts.body

# Term counts: vocabulary learnt on the training posts, then reused for test.
vectorizer = CountVectorizer(tokenizer=tokenize)
train_matrix = vectorizer.fit_transform(train_posts_.values)
test_matrix = vectorizer.transform(test_posts_.values)

In [None]:
# Show the first two training posts. Use positional access: after the rows
# with empty labels were filtered out, index labels 0 and 1 may no longer
# exist, and label-based `[0]` would then raise KeyError.
print(train_posts_.iloc[0])
print(train_posts_.iloc[1])

In [None]:
# Define the features
X_train = train_matrix
X_test = test_matrix

lr = LogisticRegression()
lr.fit(X_train, train_posts['discourse_type'])
predictions = lr.predict(X_test)  

In [None]:
summary, report, matrix = eval_summary(predictions, test_posts['discourse_type'])

# Overall scores as a one-row table.
c_summary = pd.DataFrame([summary], columns=['Accuracy', 'Precision', 'Recall', 'F1'])

c_summary.to_csv('c_summary_q3.csv')
files.download('c_summary_q3.csv')

# The rows of this report are discourse types, so name the index accordingly
# (it was labelled 'subreddit', a leftover from Part A).
c_report = dataframe_from_report(report, 'discourse_type')
c_matrix = pd.DataFrame(matrix, columns=c_report.index, index=c_report.index)

c_matrix.to_csv('confusion_matrix_q3.csv')
files.download('confusion_matrix_q3.csv')

c_report.to_csv('report_q3.csv')
files.download('report_q3.csv')

print(c_summary)
print()
print(report)
print()
print(matrix)

### Q4

Discourse classification may be more subtle than subreddit classification. There are many different types of features that could be used to improve the effectiveness of discourse classification:

 - Content + Punctuation
    - Words unigrams, bigrams, and trigrams
    - Using the title of the comment (when it’s the first comment)
    - Character n-grams (n=1,2,3,4)
    - A tokenizer that includes punctuation as tokens instead of removing it (including "?" and "!")
 - Structure
    - The depth of the comment, raw or normalized by the length of the thread
    - The number of sentences, number of words, and number of characters of both the body and the title of the comment
 - Author
    - A binary feature for whether the current author is also the author of the initial post
    - A binary feature for whether the current commenter is the same as the parent commenter
 - Thread features
    - The total number of comments in the discussion
    - Number of unique branches in the discussion tree
    - Whether the discussion originated as a self-post or a link-post
    - Average length of all the branches or threads of discussion in the discussion tree
 - Community
    - The subreddit the post came from
 - Word2Vec / NLP
    - Average/ Max of pre-trained Glove embeddings for the post
    - Average/ Max of gensim word2vec reddit embeddings for the post
    - Whether the post contains particular part-of-speech n-grams (n=2,3,4)

Implement one feature from each of the 6 different feature types above. Combine all features together into a single model, then evaluate it. Then ablate the features by learning a model while "leaving one out", and report the performances. The required evaluation measures and layout of results to include in your report are described below.

In [None]:
def load_posts_rich(file):
  """Flatten a JSON-lines file of threads into one feature row per post.

  Args:
    file: path of a file holding one JSON thread per line. Every thread must
      provide 'is_self_post', 'subreddit', 'title' and 'posts'.

  Returns:
    A DataFrame with the columns is_self_post, thread_length (number of posts
    in the thread), post_depth, post_author, thread_author, subreddit,
    thread_title, body and discourse_type (the post's 'majority_type', or ''
    when absent).
  """
  labels = [
      'is_self_post',
      'thread_length',
      'post_depth',
      'post_author',
      'thread_author',
      'subreddit',
      'thread_title',
      'body',
      'discourse_type'
  ]

  # A temporary variable to store the list of post content.
  posts_tmp = list()

  # Read as UTF-8 explicitly instead of relying on the platform default.
  with open(file, encoding='utf-8') as jsonfile:
    for line in jsonfile:
      # Tolerate blank lines (e.g. a trailing newline at the end of the file).
      if not line.strip():
        continue
      thread = json.loads(line)
      # Set once the post flagged 'is_first_post' is reached; posts listed
      # before it (if any) see None.
      thread_author = None
      for post in thread['posts']:
        # NOTE: This could be changed to use additional features from the post or thread.
        # DO NOT change the labels for the test set.
        discourse_type = post.get('majority_type', '')

        if post.get('is_first_post'):
          # The opening post defines the thread author and has depth 0.
          thread_author = post.get('author', None)
          post_depth = 0
        else:
          # A missing depth defaults to 0 (as load_posts does) rather than
          # aborting the whole load with a KeyError.
          post_depth = post.get('post_depth', 0)

        posts_tmp.append([
            thread["is_self_post"],          # is_self_post
            len(thread['posts']),            # thread_length
            post_depth,                      # post_depth
            post.get('author', None),        # post_author
            thread_author,                   # thread_author
            thread['subreddit'],             # subreddit
            thread['title'],                 # thread_title
            post.get('body',''),             # body
            discourse_type,                  # discourse_type
        ])

  # Create the posts data frame.
  return pd.DataFrame(posts_tmp, columns=labels)

In [None]:
# Reload both splits with the richer per-post features and drop unlabelled
# posts. `.copy()` makes each result an independent frame, so adding columns to
# it later does not trigger pandas' SettingWithCopyWarning.
train_posts = load_posts_rich(discourse_train)
train_posts = train_posts[train_posts['discourse_type'] != ""].copy()

test_posts = load_posts_rich(discourse_test)
test_posts = test_posts[test_posts['discourse_type'] != ""].copy()

# Gold discourse labels for Q4.
train_labels = train_posts['discourse_type']
test_labels = test_posts['discourse_type']

print()
print("Train size", len(train_posts))
print("Test size", len(test_posts))

In [None]:
# Reddit-aware tokenizer installed straight from GitHub (no version is pinned).
!pip install git+https://github.com/erikavaris/tokenizer.git
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from tokenizer import tokenizer as r_tokenizer
# NLTK data: the stop-word list, and 'punkt' (presumably required by
# sent_tokenize / word_tokenize -- confirm against the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, MaxAbsScaler
from sklearn.linear_model import LogisticRegression

In [None]:
# English stop words, minus words that are kept as tokens for discourse
# prediction: question words, negations, and a few pronouns / copulas.
stopw = set(stopwords.words('english')) - {
    'who', 'where', 'when', 'why', 'what', 'which', 'how',
    'no', 'not', 'doesn\'t',
    'i', 'are', 'is', 'it',
}

# Tokenizer from the `tokenizer` package installed above.
R = r_tokenizer.RedditTokenizer()

def tknze(string):
  """Tokenize ``string`` with ``R``, lower-case the tokens and drop those in ``stopw``."""
  lowered_tokens = (token.lower() for token in R.tokenize(string))
  return [token for token in lowered_tokens if token not in stopw]

# Quick demonstration on a short question.
tknze("How is it going?")

In [None]:
# Train a part-of-speech tagger on the NLTK treebank sample

import nltk
from nltk.corpus import treebank

nltk.download('treebank')
corpus = nltk.corpus.treebank
treebank_tagged_sentences = corpus.tagged_sents()
# Materialise the corpus view into a plain list of tagged sentences.
tagged_sentences = list(treebank_tagged_sentences)
# Start from an untrained perceptron tagger and train it on those sentences.
tagger = nltk.tag.perceptron.PerceptronTagger(load=False)
tagger.train(tagged_sentences)

In [None]:
# Named `pos_tag_text` (formerly `get_pos`): a later cell defines a different
# `get_pos(df)` column selector under the same name, which silently replaced
# this function and made this cell fail when re-run afterwards.
def pos_tag_text(string):
  """Return the POS tags of ``string`` as one space-separated string.

  The text is split on single spaces (empty pieces are dropped) and the pieces
  are tagged with the trained ``tagger``.
  """
  words = [piece for piece in string.split(' ') if piece != '']
  tags = [tag for word, tag in tagger.tag(words)]
  return " ".join(tags)

# Add the POS-tag sequence of every post body as a new 'pos' column.
train_posts['pos'] = train_posts['body'].apply(pos_tag_text)
test_posts['pos'] = test_posts['body'].apply(pos_tag_text)

In [None]:
# Vectorizer & text-based features
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer

# TF-IDF over `tknze` tokens: unigrams and bigrams, at most 10,000 features,
# with sublinear term-frequency scaling.
post_vectorizer = TfidfVectorizer(tokenizer=tknze, max_features= 10000, ngram_range= (1, 2), sublinear_tf= True)

def get_text(df):
  """Return the `body` column of `df` as an array of post texts."""
  return df['body'].values 

# Wraps get_text as a transformer so it can be the first step of a pipeline.
select_text = FunctionTransformer(get_text, validate=False)

# Content feature: select the post body, then vectorize it with post_vectorizer.
content_punctuation_pipeline = Pipeline([
      ('select_text', select_text), 
      ('vectorizer', post_vectorizer)
])

# Alternative content feature: the same selection, vectorized with the Part A
# `tokenize` function and otherwise default n-gram / vocabulary settings.
normal_vectorizer =  TfidfVectorizer(tokenizer=tokenize, sublinear_tf= True)
normal_pipeline = Pipeline([
      ('select_text', select_text), 
      ('vectorizer_normal', normal_vectorizer)
])

# Structure: the number of sentences, words and characters of the post body
def get_post_structure(df):
  """Return one ``[n_sentences, n_words, n_characters]`` list per row of ``df``.

  Sentences and words are counted with NLTK's ``sent_tokenize`` and
  ``word_tokenize`` on the ``body`` column.
  """
  return [
      [len(sent_tokenize(body)), len(word_tokenize(body)), len(body)]
      for body in df['body']
  ]
post_structure = FunctionTransformer(get_post_structure, validate=False)

# Author: A binary feature for whether the current author is also the author of the initial post
import numpy as np
def is_same_author(df):
  """Return an (n_rows, 1) array: 1 where post_author equals thread_author, else 0."""
  author_pairs = zip(df['post_author'], df['thread_author'])
  flags = [1 if post_author == thread_author else 0
           for post_author, thread_author in author_pairs]
  return np.array(flags).reshape(len(flags), 1)
# Wraps is_same_author as a transformer so it can be used inside a FeatureUnion.
same_author = FunctionTransformer(is_same_author, validate=False)

# Thread features: The total number of comments in the discussion
def get_thread_length(df):
  """Return the `thread_length` column of `df` reshaped to (len(df), 1)."""
  return df['thread_length'].values.reshape(len(df),1)
# Wraps get_thread_length as a transformer so it can be used inside a FeatureUnion.
thread_length = FunctionTransformer(get_thread_length, validate=False)

# Community: The subreddit the post came from
def get_subreddit(df):    
  """Return the `subreddit` column of `df` as an array of names."""
  return df['subreddit'].values 
subreddit_hash_function = FunctionTransformer(get_subreddit, validate=False)

# Despite the "hash" in the name, no hashing is involved here: the subreddit
# names are selected and then encoded by a CountVectorizer with default settings.
subreddit_hash = Pipeline([
      ('get_subrr', subreddit_hash_function), 
      ('vectorizer', CountVectorizer())
])

# Word2Vec / NLP
# NOTE(review): despite the "w2v" / "word_2_vec" names, nothing in this block
# uses word embeddings. The feature is a TF-IDF over the strings stored in the
# `pos` column of the frame.
tokenizer_dashes = RegexpTokenizer(r'\w+')
def simple_tokenizer(string):
  """Split `string` with `tokenizer_dashes` and lower-case every token."""
  return [token.lower() for token in tokenizer_dashes.tokenize(string)]

# TF-IDF over unigrams and bigrams of those tokens, at most 5,000 features,
# with sublinear term-frequency scaling.
w2v_vectorizer = TfidfVectorizer(tokenizer=simple_tokenizer, sublinear_tf= True, ngram_range= (1, 2), max_features=5000)

def get_pos(df):
  """Return the `pos` column of `df` as an array."""
  return df['pos'].values

# Wraps get_pos as a transformer so it can be the first step of a pipeline.
select_pos = FunctionTransformer(get_pos, validate=False)

# Select the `pos` column, then vectorize it with w2v_vectorizer.
word_2_vec = Pipeline([
      ('select_pos', select_pos), 
      ('w2v_vectorizer', w2v_vectorizer)
])

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer

# The six feature groups of the full model, in the order they are concatenated.
_ALL_FEATURES = [
    ('Content + Punctuation', content_punctuation_pipeline),
    ('Structure', post_structure),
    ('Author', same_author),
    ('Thread features', thread_length),
    ('Community', subreddit_hash),
    ('Word2vec', word_2_vec),
]

def _build_experiment(feature_steps):
  """Combine the given (name, transformer) steps and feed them to a new LogisticRegression."""
  return Pipeline([
      ('union', FeatureUnion(list(feature_steps))),
      ('classify', LogisticRegression())])

def _without(feature_name):
  """Return the full feature list minus the group called ``feature_name``."""
  return [step for step in _ALL_FEATURES if step[0] != feature_name]

# The full model, one leave-one-out ablation per listed feature group, and a
# variant whose content feature uses the plain Part A tokenizer.
# The dict used to contain the key 'no_structure' twice: the second entry (the
# plain-tokenizer variant) silently replaced the real structure ablation, which
# therefore never ran. The variant now has its own key.
experiments = {
    'all': _build_experiment(_ALL_FEATURES),
    'no_w2v': _build_experiment(_without('Word2vec')),
    'no_community': _build_experiment(_without('Community')),
    'no_thread': _build_experiment(_without('Thread features')),
    'no_author': _build_experiment(_without('Author')),
    'no_structure': _build_experiment(_without('Structure')),
    'normal_tokenizer': _build_experiment(
        [('Content + Punctuation', normal_pipeline)] + _without('Content + Punctuation')),
}

# Fit and evaluate every experiment, saving (and offering for download) its
# per-class report, confusion matrix and overall scores.
for experiment, model in experiments.items():
  print(experiment)
  model.fit(train_posts, train_labels)
  predictions = model.predict(test_posts)

  summary, report, matrix = eval_summary(predictions, test_posts['discourse_type'])

  # The rows of this report are discourse types, so name the index accordingly
  # (it was labelled 'subreddit', a leftover from Part A).
  c_report = dataframe_from_report(report, 'discourse_type')

  c_report.to_csv('report_q4_%s.csv' % experiment)
  files.download ('report_q4_%s.csv' % experiment)

  c_matrix = pd.DataFrame(matrix, columns=c_report.index, index=c_report.index)
  # This used to format the name with `datos`, a variable only defined in a
  # later cell, which raised NameError on a fresh kernel.
  print('confusion_matrix_q4_%s.csv' % experiment)
  c_matrix.to_csv('confusion_matrix_q4_%s.csv' % experiment)
  files.download ('confusion_matrix_q4_%s.csv' % experiment)

  c_summary = pd.DataFrame.from_dict({
      'accuracy' : [summary[0]],
      'precission': [summary[1]],
      'recall': [summary[2]],
      'f1': [summary[3]]
  })

  c_summary.to_csv('summary_q4_%s.csv' % experiment)
  files.download ('summary_q4_%s.csv' % experiment)

  print(c_summary)
  print()
  print(report)
  print()
  print(matrix)

In [None]:
# Re-scores whatever `predictions` currently holds (the last experiment run
# above) and saves the results under a fixed tag.
datos = "old_subredd_function"

summary, report, matrix = eval_summary(predictions, test_posts['discourse_type'])
c_report = dataframe_from_report(report, 'subreddit')

# Per-class report.
report_path = 'report_q4_%s.csv' % datos
c_report.to_csv(report_path)
files.download (report_path)

# Confusion matrix, labelled with the classes of the report.
matrix_path = 'confusion_matrix_q4_%s.csv' % datos
c_matrix = pd.DataFrame(matrix, columns=c_report.index, index=c_report.index)
print(matrix_path)
c_matrix.to_csv(matrix_path)
files.download (matrix_path)

# Overall scores as a one-row table.
accuracy, precision, recall, f1 = summary
c_summary = pd.DataFrame.from_dict({
    'accuracy' : [accuracy],
    'precission': [precision],
    'recall': [recall],
    'f1': [f1]
})

summary_path = 'summary_q4_%s.csv' % datos
c_summary.to_csv(summary_path)
files.download (summary_path)

print(c_summary)
print()
print(report)
print()
print(matrix)