In [24]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Evaluation function

In [4]:
# from Rob Mulla @robikscube
# https://www.kaggle.com/robikscube/student-writing-competition-twitch
def calc_overlap(row):
    """
    Calculates the overlap between prediction and
    ground truth and overlap percentages used for determining
    true positives.
    """
    set_pred = set(row.predictionstring_pred.split(" "))
    set_gt = set(row.predictionstring_gt.split(" "))
    # Length of each and intersection
    len_gt = len(set_gt)
    len_pred = len(set_pred)
    inter = len(set_gt.intersection(set_pred))
    overlap_1 = inter / len_gt
    overlap_2 = inter / len_pred
    return [overlap_1, overlap_2]

def score_feedback_comp_micro(pred_df, gt_df):
    """
    A function that scores for the kaggle
        Student Writing Competition

    Uses the steps in the evaluation page here:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation
    """
    gt_df = (
        gt_df[["id", "discourse_type", "predictionstring"]]
        .reset_index(drop=True)
        .copy()
    )
    pred_df = pred_df[["id", "class", "predictionstring"]].reset_index(drop=True).copy()
    pred_df["pred_id"] = pred_df.index
    gt_df["gt_id"] = gt_df.index
    # Step 1. all ground truths and predictions for a given class are compared.
    joined = pred_df.merge(
        gt_df,
        left_on=["id", "class"],
        right_on=["id", "discourse_type"],
        how="outer",
        suffixes=("_pred", "_gt"),
    )
    joined["predictionstring_gt"] = joined["predictionstring_gt"].fillna(" ")
    joined["predictionstring_pred"] = joined["predictionstring_pred"].fillna(" ")

    joined["overlaps"] = joined.apply(calc_overlap, axis=1)

    # 2. If the overlap between the ground truth and prediction is >= 0.5,
    # and the overlap between the prediction and the ground truth >= 0.5,
    # the prediction is a match and considered a true positive.
    # If multiple matches exist, the match with the highest pair of overlaps is taken.
    joined["overlap1"] = joined["overlaps"].apply(lambda x: eval(str(x))[0])
    joined["overlap2"] = joined["overlaps"].apply(lambda x: eval(str(x))[1])

    joined["potential_TP"] = (joined["overlap1"] >= 0.5) & (joined["overlap2"] >= 0.5)
    joined["max_overlap"] = joined[["overlap1", "overlap2"]].max(axis=1)
    tp_pred_ids = (
        joined.query("potential_TP")
        .sort_values("max_overlap", ascending=False)
        .groupby(["id", "predictionstring_gt"])
        .first()["pred_id"]
        .values
    )

    # 3. Any unmatched ground truths are false negatives
    # and any unmatched predictions are false positives.
    fp_pred_ids = [p for p in joined["pred_id"].unique() if p not in tp_pred_ids]

    matched_gt_ids = joined.query("potential_TP")["gt_id"].unique()
    unmatched_gt_ids = [c for c in joined["gt_id"].unique() if c not in matched_gt_ids]

    # Get numbers of each type
    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)
    # calc microf1
    my_f1_score = TP / (TP + 0.5 * (FP + FN))
    return my_f1_score

def score_feedback_comp(pred_df, gt_df, return_class_scores=False):
    class_scores = {}
    pred_df = pred_df[["id", "class", "predictionstring"]].reset_index(drop=True).copy()
    for discourse_type, gt_subset in gt_df.groupby("discourse_type"):
        pred_subset = (
            pred_df.loc[pred_df["class"] == discourse_type]
            .reset_index(drop=True)
            .copy()
        )
        class_score = score_feedback_comp_micro(pred_subset, gt_subset)
        class_scores[discourse_type] = class_score
    f1 = np.mean([v for v in class_scores.values()])
    if return_class_scores:
        return f1, class_scores
    return f1

# Data location

In [5]:
%store -r student_writing_output_data_s3_uri
student_writing_output_data_s3_uri

's3://kaggle-writing-student/processed_dataset/'

# Train model Ad-Hoc

In [18]:
!aws s3 cp s3://kaggle-writing-student/processed_dataset/train.csv .
!aws s3 cp s3://kaggle-writing-student/processed_dataset/valid.csv .

download: s3://kaggle-writing-student/processed_dataset/train.csv to ./train.csv
download: s3://kaggle-writing-student/processed_dataset/valid.csv to ./valid.csv


In [19]:
train_df = pd.read_csv('train.csv')

In [8]:
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_df.discourse_type.to_numpy())
# valid_labels_encoded = label_encoder.transform(valid_df.discourse_type.to_numpy())
# test_labels_encoded = label_encoder.transform(test_df.discourse_type.to_numpy())

train_labels_encoded, train_labels_encoded.shape, type(train_labels_encoded)

(array([4, 5, 3, ..., 0, 3, 1]), (115441,), numpy.ndarray)

In [9]:
label_encoder.classes_

array(['Claim', 'Concluding Statement', 'Counterclaim', 'Evidence',
       'Lead', 'Position', 'Rebuttal'], dtype=object)

In [34]:
modelNB = Pipeline([
  ("tf-idf", TfidfVectorizer()),
  ("clf", MultinomialNB())
])

In [35]:
modelNB.fit(train_df.discourse_text, train_labels_encoded)

Pipeline(memory=None,
         steps=[('tf-idf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [None]:
valid_df = pd.read_csv('valid.csv')

In [36]:
modelNB.predict(valid_df.discourse_text)

array([3, 0, 0, ..., 0, 0, 3])

In [37]:
pred_df = valid_df.copy()
# Replace column with predictions
pred_df.discourse_type = pd.Series(data=(list(map(lambda x: label_encoder.classes_[x], modelNB.predict(valid_df.discourse_text))))).values

pred_df = pred_df[['id','discourse_type','predictionstring']].copy()
pred_df.columns = ['id','class','predictionstring']

score_feedback_comp(pred_df, valid_df, return_class_scores=True)

(0.1695693265081649,
 {'Claim': 0.5690787291901572,
  'Concluding Statement': 0.0,
  'Counterclaim': 0.0,
  'Evidence': 0.5876798561151079,
  'Lead': 0.0,
  'Position': 0.030226700251889168,
  'Rebuttal': 0.0})

# Train model with Script

In [9]:
import sagemaker
import boto3
import os

sess = sagemaker.Session()
sm_bucket = sess.default_bucket()
                     
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name="sagemaker", region_name=region)
s3 = boto3.Session().client(service_name="s3", region_name=region)

In [13]:
# https://aws.amazon.com/sagemaker/pricing/
train_instance_type = "ml.m5.large"
train_instance_count = 1

In [14]:
from sagemaker.sklearn import SKLearn

estimator = SKLearn(
    entry_point="train_script.py",
    framework_version="0.23-1",
    instance_count=train_instance_count,
    instance_type=train_instance_type,
    role=role,
    hyperparameters={"alpha": 1.0, "fit_prior": True},
)

In [15]:
%store -r student_writing_output_data_s3_uri
student_writing_output_data_s3_uri

's3://kaggle-writing-student/processed_dataset/'

In [16]:
s3_input_train_data = os.path.join(student_writing_output_data_s3_uri, 'train.csv')
s3_input_train_data

's3://kaggle-writing-student/processed_dataset/train.csv'

In [17]:
estimator.fit(
    inputs={"train": s3_input_train_data},
    wait=False,
)

In [18]:
training_job_name = estimator.latest_training_job.name
print("Training Job Name:  {}".format(training_job_name))

Training Job Name:  sagemaker-scikit-learn-2022-02-23-16-02-46-735


In [19]:
from IPython.core.display import display, HTML

display(
    HTML(
        '<b>Review <a target="blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/jobs/{}">Training Job</a> After About 5 Minutes</b>'.format(
            region, training_job_name
        )
    )
)

In [22]:
from IPython.core.display import display, HTML

display(
    HTML(
        '<b>Review <a target="blank" href="https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/TrainingJobs;prefix={};streamFilter=typeLogStreamPrefix">CloudWatch Logs</a> After About 5 Minutes</b>'.format(
            region, training_job_name
        )
    )
)

In [21]:
from IPython.core.display import display, HTML

display(
    HTML(
        '<b>Review <a target="blank" href="https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview">S3 Output Data</a> After The Training Job Has Completed</b>'.format(
            bucket, training_job_name, region
        )
    )
)

In [None]:
training_job_name

In [None]:
sm_bucket

In [None]:
!aws s3 cp s3://$sm_bucket/$training_job_name/output/model.tar.gz s3://kaggle-writing-student/models/nb_model.tar.gz

In [None]:
!aws s3 cp s3://$sm_bucket/$training_job_name/output/model.tar.gz s3://kaggle-writing-student/models/nb_model.tar.gz