# NLP assignment 3 - create datasets

## Imports

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import json
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [0]:
#Allow access to Google Drive
#Mounting is interactive: Colab prompts for an authorization code, after which
#the Drive contents are available under /content/gdrive
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


## Load in the data using the script from the SARC GitHub
https://github.com/NLPrinceton/SARC

In [0]:
def _read_sarc_split(split_file, comments, lower):
  '''reads one SARC split file and resolves its comment ids to comment text
  Args:
    split_file: '|'-delimited csv file; each row holds three fields
      (ancestor ids, response ids, labels), each a space-separated list
    comments: dict mapping a comment id to a dict with a 'text' entry
    lower: boolean; if True, converts comments to lowercase
  Returns:
    docs, labels
    docs: {'ancestors': list of ancestor texts for all sequences,
           'responses': list of response texts for all sequences}
    labels: list of label strings for the responses of all sequences.
  '''

  def comment_text(comment_id):
    text = comments[comment_id]['text']
    return text.lower() if lower else text

  docs = {'ancestors': [], 'responses': []}
  labels = []
  # newline='' is what the csv module expects from the file object it reads.
  with open(split_file, 'r', encoding='utf-8', newline='') as f:
    for row in csv.reader(f, delimiter='|'):
      if not row:
        # csv.reader yields [] for a blank line (e.g. a trailing newline);
        # skip it instead of failing on row[0].
        continue
      ancestor_ids = row[0].split(' ')
      response_ids = row[1].split(' ')
      row_labels = row[2].split(' ')
      docs['ancestors'].append([comment_text(c) for c in ancestor_ids])
      docs['responses'].append([comment_text(c) for c in response_ids])
      labels.append(row_labels)

  return docs, labels


def load_sarc_responses(train_file, test_file, comment_file, lower=True):
  '''loads SARC data from csv files
  Args:
    train_file: csv file with train sequences
    test_file: csv file with test sequences
    comment_file: json file with details about all comments
    lower: boolean; if True, converts comments to lowercase
  Returns:
    train_docs, test_docs, train_labels, test_labels (in that order)
    train_docs / test_docs: {'ancestors': list of ancestors for all sequences,
                             'responses': list of responses for all sequences}
    train_labels / test_labels: list of labels for responses for all sequences.
  '''

  # The comment texts are JSON, so read them as UTF-8 rather than relying on
  # the platform default encoding.
  with open(comment_file, 'r', encoding='utf-8') as f:
    comments = json.load(f)

  train_docs, train_labels = _read_sarc_split(train_file, comments, lower)
  test_docs, test_labels = _read_sarc_split(test_file, comments, lower)

  return train_docs, test_docs, train_labels, test_labels

In [0]:
### SARC Directory Paths ###
#Folder on the mounted Drive that holds the SARC files. File names are joined
#to it by plain string concatenation, so the trailing slash is required.
SARC_POL = '/content/gdrive/My Drive/SARC pol/'

In [0]:
#Balanced split: train and test documents/labels, with the original casing kept
(balanced_train_docs,
 balanced_test_docs,
 balanced_train_labels,
 balanced_test_labels) = load_sarc_responses(
    train_file=SARC_POL+'train-balanced.csv',
    test_file=SARC_POL+'test-balanced.csv',
    comment_file=SARC_POL+'comments.json',
    lower=False)

In [0]:
#Unbalanced split: only the test documents/labels are kept, the train outputs
#are discarded into `_`
_, unbalanced_test_docs, _, unbalanced_test_labels = load_sarc_responses(
    train_file=SARC_POL+'train-unbalanced.csv',
    test_file=SARC_POL+'test-unbalanced.csv',
    comment_file=SARC_POL+'comments.json',
    lower=False)

## Split the balanced training set to create a training set (80% of original) and a validation set (20%)

In [0]:
#Put the ancestors in a dataframe: its row index then identifies each
#ancestor thread and is what the splits below operate on
anc_df = pd.DataFrame(data=balanced_train_docs['ancestors'])

In [0]:
#Randomly hold out 20% of the ancestor rows for validation (fixed seed)
anc_train, anc_valid = train_test_split(
    anc_df,
    test_size=0.2,
    random_state=5,
)

In [0]:
#Create a dataframe for the 80% training set: one row per response, keyed by
#the index of its ancestor thread in balanced_train_docs.
#The rows are collected first and the frame is built once: DataFrame.append
#copied the whole frame on every call and was removed in pandas 2.0.
resp_train = pd.DataFrame(
  [
    {"ancestor_index": int(i),
     "response": balanced_train_docs['responses'][i][j],
     "label": balanced_train_labels[i][j]}
    for i in anc_train.index
    for j in (0, 1)  #responses 0 and 1 of each thread
  ],
  columns = ['ancestor_index', 'response', 'label'])

In [0]:
#Create a dataframe for the 20% validation set: one row per response, keyed by
#the index of its ancestor thread in balanced_train_docs.
#The rows are collected first and the frame is built once: DataFrame.append
#copied the whole frame on every call and was removed in pandas 2.0.
resp_valid = pd.DataFrame(
  [
    {"ancestor_index": int(i),
     "response": balanced_train_docs['responses'][i][j],
     "label": balanced_train_labels[i][j]}
    for i in anc_valid.index
    for j in (0, 1)  #responses 0 and 1 of each thread
  ],
  columns = ['ancestor_index', 'response', 'label'])

### Save the split training and validation sets as csv files: we will call this training set the 100% project training set. 

In [0]:
#Write the 100% project training set to the project_data folder on Drive
resp_train.to_csv(path_or_buf=SARC_POL + 'project_data/project_training_100.csv')

In [0]:
#Write the validation set to the project_data folder on Drive
resp_valid.to_csv(path_or_buf=SARC_POL + 'project_data/project_validation.csv')

## Split the project training set into the smaller training sets: 50%, 25%, 12.5% and 6.25%

### 50% project training set

In [0]:
#Keep a random 50% of the training ancestors (fixed seed); the rest is discarded
anc_train50, _ = train_test_split(
    anc_train,
    test_size=0.5,
    random_state=50,
)

In [0]:
#Create a dataframe for the 50% project training set: one row per response,
#keyed by the index of its ancestor thread in balanced_train_docs.
#The rows are collected first and the frame is built once: DataFrame.append
#copied the whole frame on every call and was removed in pandas 2.0.
resp_train50 = pd.DataFrame(
  [
    {"ancestor_index": int(i),
     "response": balanced_train_docs['responses'][i][j],
     "label": balanced_train_labels[i][j]}
    for i in anc_train50.index
    for j in (0, 1)  #responses 0 and 1 of each thread
  ],
  columns = ['ancestor_index', 'response', 'label'])

In [0]:
#Write the 50% project training set to the project_data folder on Drive
resp_train50.to_csv(path_or_buf=SARC_POL + 'project_data/project_training_50.csv')

### 25% project training set

In [0]:
#Keep a random 25% of the training ancestors (fixed seed). The sample is drawn
#directly from anc_train, not from the 50% subset.
anc_train25, _ = train_test_split(
    anc_train,
    test_size=0.75,
    random_state=25,
)

In [0]:
#Create a dataframe for the 25% project training set: one row per response,
#keyed by the index of its ancestor thread in balanced_train_docs.
#The rows are collected first and the frame is built once: DataFrame.append
#copied the whole frame on every call and was removed in pandas 2.0.
resp_train25 = pd.DataFrame(
  [
    {"ancestor_index": int(i),
     "response": balanced_train_docs['responses'][i][j],
     "label": balanced_train_labels[i][j]}
    for i in anc_train25.index
    for j in (0, 1)  #responses 0 and 1 of each thread
  ],
  columns = ['ancestor_index', 'response', 'label'])

In [0]:
#Write the 25% project training set to the project_data folder on Drive
resp_train25.to_csv(path_or_buf=SARC_POL + 'project_data/project_training_25.csv')

### 12.5% project training set

In [0]:
#Keep a random 12.5% of the training ancestors (fixed seed). The sample is
#drawn directly from anc_train, not from one of the larger subsets.
anc_train12, _ = train_test_split(
    anc_train,
    test_size=0.875,
    random_state=12,
)

In [0]:
#Create a dataframe for the 12.5% project training set: one row per response,
#keyed by the index of its ancestor thread in balanced_train_docs.
#The rows are collected first and the frame is built once: DataFrame.append
#copied the whole frame on every call and was removed in pandas 2.0.
resp_train12 = pd.DataFrame(
  [
    {"ancestor_index": int(i),
     "response": balanced_train_docs['responses'][i][j],
     "label": balanced_train_labels[i][j]}
    for i in anc_train12.index
    for j in (0, 1)  #responses 0 and 1 of each thread
  ],
  columns = ['ancestor_index', 'response', 'label'])

In [0]:
#Write the 12.5% project training set to the project_data folder on Drive
resp_train12.to_csv(path_or_buf=SARC_POL + 'project_data/project_training_12.csv')

### 6.25% project training set

In [0]:
#Keep a random 6.25% of the training ancestors (fixed seed). The sample is
#drawn directly from anc_train, not from one of the larger subsets.
anc_train6, _ = train_test_split(
    anc_train,
    test_size=0.9375,
    random_state=6,
)

In [0]:
#Create a dataframe for the 6.25% project training set: one row per response,
#keyed by the index of its ancestor thread in balanced_train_docs.
#The rows are collected first and the frame is built once: DataFrame.append
#copied the whole frame on every call and was removed in pandas 2.0.
resp_train6 = pd.DataFrame(
  [
    {"ancestor_index": int(i),
     "response": balanced_train_docs['responses'][i][j],
     "label": balanced_train_labels[i][j]}
    for i in anc_train6.index
    for j in (0, 1)  #responses 0 and 1 of each thread
  ],
  columns = ['ancestor_index', 'response', 'label'])

In [0]:
#Write the 6.25% project training set to the project_data folder on Drive
resp_train6.to_csv(path_or_buf=SARC_POL + 'project_data/project_training_6.csv')

## Balanced test set

In [0]:
#Put the balanced test ancestors in a dataframe: its row index identifies
#each ancestor thread
test_anc_df = pd.DataFrame(data=balanced_test_docs['ancestors'])

In [0]:
#Create a dataframe for the balanced test set: one row per response, keyed by
#the index of its ancestor thread in balanced_test_docs.
#The rows are collected first and the frame is built once: DataFrame.append
#copied the whole frame on every call and was removed in pandas 2.0.
resp_test_balanced = pd.DataFrame(
  [
    {"ancestor_index": int(i),
     "response": balanced_test_docs['responses'][i][j],
     "label": balanced_test_labels[i][j]}
    for i in test_anc_df.index
    for j in (0, 1)  #responses 0 and 1 of each thread
  ],
  columns = ['ancestor_index', 'response', 'label'])

In [0]:
#Write the balanced test set to the project_data folder on Drive
resp_test_balanced.to_csv(path_or_buf=SARC_POL + 'project_data/balanced_test.csv')