In [13]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import f1_score, accuracy_score

In [15]:
np.random.seed(500)

url = 'https://raw.githubusercontent.com/Smolky/hahackathon-2021/main/datasets/hahackathon_train.csv'
train_data = pd.read_csv(url)
train_data

Unnamed: 0,id,text,is_humor,humor_rating,humor_controversy,offense_rating
0,1,TENNESSEE: We're the best state. Nobody even c...,1,2.42,1.0,0.20
1,2,A man inserted an advertisement in the classif...,1,2.50,1.0,1.10
2,3,How many men does it take to open a can of bee...,1,1.95,0.0,2.40
3,4,Told my mom I hit 1200 Twitter followers. She ...,1,2.11,1.0,0.00
4,5,Roses are dead. Love is fake. Weddings are bas...,1,2.78,0.0,0.10
...,...,...,...,...,...,...
7995,7996,Lack of awareness of the pervasiveness of raci...,0,,,0.25
7996,7997,Why are aspirins white? Because they work sorry,1,1.33,0.0,3.85
7997,7998,"Today, we Americans celebrate our independence...",1,2.55,0.0,0.00
7998,7999,How to keep the flies off the bride at an Ital...,1,1.00,0.0,3.00


In [18]:
#Data pre-processing

nltk.download('punkt') 

def preprocess(data):
  #(1)Remove blank rows if any
  data['text'].dropna(inplace=True)

  #(2)Change all the text to lower case
  data['text'] = [entry.lower() for entry in data['text']]

  #(3)Tokenization :  each entry will be broken into set of words
  # create a new column with tokenized text
  data['tokens'] = data['text'].apply(lambda x: str(word_tokenize(x)))

preprocess(train_data)
train_data

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,id,text,is_humor,humor_rating,humor_controversy,offense_rating,tokens
0,1,tennessee: we're the best state. nobody even c...,1,2.42,1.0,0.20,"['tennessee', ':', 'we', ""'re"", 'the', 'best',..."
1,2,a man inserted an advertisement in the classif...,1,2.50,1.0,1.10,"['a', 'man', 'inserted', 'an', 'advertisement'..."
2,3,how many men does it take to open a can of bee...,1,1.95,0.0,2.40,"['how', 'many', 'men', 'does', 'it', 'take', '..."
3,4,told my mom i hit 1200 twitter followers. she ...,1,2.11,1.0,0.00,"['told', 'my', 'mom', 'i', 'hit', '1200', 'twi..."
4,5,roses are dead. love is fake. weddings are bas...,1,2.78,0.0,0.10,"['roses', 'are', 'dead', '.', 'love', 'is', 'f..."
...,...,...,...,...,...,...,...
7995,7996,lack of awareness of the pervasiveness of raci...,0,,,0.25,"['lack', 'of', 'awareness', 'of', 'the', 'perv..."
7996,7997,why are aspirins white? because they work sorry,1,1.33,0.0,3.85,"['why', 'are', 'aspirins', 'white', '?', 'beca..."
7997,7998,"today, we americans celebrate our independence...",1,2.55,0.0,0.00,"['today', ',', 'we', 'americans', 'celebrate',..."
7998,7999,how to keep the flies off the bride at an ital...,1,1.00,0.0,3.00,"['how', 'to', 'keep', 'the', 'flies', 'off', '..."


In [19]:
test_data = pd.read_csv('https://raw.githubusercontent.com/Smolky/hahackathon-2021/main/datasets/hahackathon_test.csv')
preprocess(test_data)
test_data

Unnamed: 0,id,text,tokens
0,9001,finding out your ex got fat is like finding 20...,"['finding', 'out', 'your', 'ex', 'got', 'fat',..."
1,9002,"for brockmann, stereotypes imperil national se...","['for', 'brockmann', ',', 'stereotypes', 'impe..."
2,9003,a girl runs up to her mother with a pile of cr...,"['a', 'girl', 'runs', 'up', 'to', 'her', 'moth..."
3,9004,gotta wonder if baseball still would've been c...,"['got', 'ta', 'wonder', 'if', 'baseball', 'sti..."
4,9005,when you're dreading getting in the shower cuz...,"['when', 'you', ""'re"", 'dreading', 'getting', ..."
...,...,...,...
995,9996,what do you call a black man on the moon? an a...,"['what', 'do', 'you', 'call', 'a', 'black', 'm..."
996,9997,when im picking someone up and they ask how lo...,"['when', 'im', 'picking', 'someone', 'up', 'an..."
997,9998,"a black lesbian, an obese white neck-beard, an...","['a', 'black', 'lesbian', ',', 'an', 'obese', ..."
998,9999,and i recognize the need to use all of my plat...,"['and', 'i', 'recognize', 'the', 'need', 'to',..."


In [33]:
tfidf_vect = TfidfVectorizer(use_idf=False, max_features=8000)
tfidf_vect.fit(train_data['tokens'])

train_text_tfidf = tfidf_vect.transform(train_data)
test_text_tfidf = tfidf_vect.transform(test_data)

train_text_tfidf.shape

(7, 8000)

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(train_data['tokens'])
test_counts = count_vect.transform(test_data['tokens'])


train_counts.shape

(8000, 14509)

In [39]:
def train(text_rep, labels):
  # define a SVM model
  model = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
  model.fit(text_rep, labels)
  return model

def test(model, text_rep):
  # predict the labels on validation dataset
  preds = model.predict(text_rep)
  return preds

humor_model = train(train_counts, train_data['is_humor'])
humor_preds = test(humor_model, test_counts)

test_data['humor_predictions'] = humor_preds
test_data

Unnamed: 0,id,text,tokens,humor_predictions
0,9001,finding out your ex got fat is like finding 20...,"['finding', 'out', 'your', 'ex', 'got', 'fat',...",1
1,9002,"for brockmann, stereotypes imperil national se...","['for', 'brockmann', ',', 'stereotypes', 'impe...",0
2,9003,a girl runs up to her mother with a pile of cr...,"['a', 'girl', 'runs', 'up', 'to', 'her', 'moth...",1
3,9004,gotta wonder if baseball still would've been c...,"['got', 'ta', 'wonder', 'if', 'baseball', 'sti...",0
4,9005,when you're dreading getting in the shower cuz...,"['when', 'you', ""'re"", 'dreading', 'getting', ...",1
...,...,...,...,...
995,9996,what do you call a black man on the moon? an a...,"['what', 'do', 'you', 'call', 'a', 'black', 'm...",1
996,9997,when im picking someone up and they ask how lo...,"['when', 'im', 'picking', 'someone', 'up', 'an...",1
997,9998,"a black lesbian, an obese white neck-beard, an...","['a', 'black', 'lesbian', ',', 'an', 'obese', ...",1
998,9999,and i recognize the need to use all of my plat...,"['and', 'i', 'recognize', 'the', 'need', 'to',...",0


In [42]:
train_data.fillna(0, inplace=True)
controversy_model = train(train_counts, train_data['humor_controversy'])
controversy_preds = test(controversy_model, test_counts)

test_data['controversy_predictions'] = controversy_preds
test_data

Unnamed: 0,id,text,tokens,humor_predictions,controversy_predictions
0,9001,finding out your ex got fat is like finding 20...,"['finding', 'out', 'your', 'ex', 'got', 'fat',...",1,1.0
1,9002,"for brockmann, stereotypes imperil national se...","['for', 'brockmann', ',', 'stereotypes', 'impe...",0,0.0
2,9003,a girl runs up to her mother with a pile of cr...,"['a', 'girl', 'runs', 'up', 'to', 'her', 'moth...",1,0.0
3,9004,gotta wonder if baseball still would've been c...,"['got', 'ta', 'wonder', 'if', 'baseball', 'sti...",0,0.0
4,9005,when you're dreading getting in the shower cuz...,"['when', 'you', ""'re"", 'dreading', 'getting', ...",1,0.0
...,...,...,...,...,...
995,9996,what do you call a black man on the moon? an a...,"['what', 'do', 'you', 'call', 'a', 'black', 'm...",1,0.0
996,9997,when im picking someone up and they ask how lo...,"['when', 'im', 'picking', 'someone', 'up', 'an...",1,1.0
997,9998,"a black lesbian, an obese white neck-beard, an...","['a', 'black', 'lesbian', ',', 'an', 'obese', ...",1,1.0
998,9999,and i recognize the need to use all of my plat...,"['and', 'i', 'recognize', 'the', 'need', 'to',...",0,0.0


In [45]:
test_data.to_csv("./svm_results.csv")