In [None]:
!pip install nltk
!pip install numpy matplotlib
!pip install pandas
!pip install gensim
!pip install sklearn



# Overview
## 1. Metrics for Text Classification

## 2. Example: Text Classification with LogisticRegression

## 3. Exercise: Hate-speech Classification 


In [None]:
import pandas as pd

# Example corpus: the 20-Newsgroups dataset (http://qwone.com/~jason/20Newsgroups/).
# This copy holds about 11k newsgroup posts spread over 20 different topics and is
# hosted at https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json

raw_data = pd.read_json(
    'https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json'
)
# Show every distinct topic name once.
print(raw_data['target_names'].unique())

['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


In [None]:
# Display the loaded frame: post text (content), numeric topic id (target)
# and topic name (target_names).
raw_data

Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space
...,...,...,...
11309,From: jim.zisfein@factory.com (Jim Zisfein) \n...,13,sci.med
11310,From: ebodin@pearl.tufts.edu\nSubject: Screen ...,4,comp.sys.mac.hardware
11311,From: westes@netcom.com (Will Estes)\nSubject:...,3,comp.sys.ibm.pc.hardware
11312,From: steve@hcrlgw (Steven Collins)\nSubject: ...,1,comp.graphics


In [None]:
# Read the text for classification.
# .tolist() copies the column in row order in a single call; the previous
# per-row label lookup was slow and only worked with a default 0..n-1 index.
text = raw_data['content'].tolist()

In [None]:
# Read the labels (numeric topic id of each post).
# .tolist() copies the column in row order in a single call; the previous
# per-row label lookup was slow and only worked with a default 0..n-1 index.
labels = raw_data['target'].tolist()

# Metrics for Text Classification

In this section, we will go through how to compute different metrics for classification tasks with sklearn. 

Basically, you pass in a list of predicted labels and the list of ground-truth labels, and the built-in function returns the computed score.

## Accuracy: 

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Toy example: the predictions match the ground truth at positions 0 and 3 only.
y_pred = [0, 2, 1, 3]
y_true = [0, 1, 2, 3]

In [None]:
# Fraction of predictions equal to the true label: 2 correct out of 4.
accuracy_score(y_true, y_pred)

0.5

## Precision, Recall and F1

Note: Macro vs Micro:

Macro: Calculate metrics for each label, and find their unweighted mean.

Micro: Calculate metrics globally by counting the total true positives, false negatives and false positives.

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [None]:
# Three-class toy example: only class 0 is ever predicted correctly
# (positions 0 and 3); position 4 is a false positive for class 0.
y_true = [0, 1, 2, 0, 1, 2]
y_pred = [0, 2, 1, 0, 0, 1]

In [None]:
# Macro precision: unweighted mean of the per-class precisions (2/3, 0, 0).
precision_score(y_true, y_pred, average='macro')

0.2222222222222222

In [None]:
# Micro precision: pooled over all classes, 2 true positives out of 6 predictions.
precision_score(y_true, y_pred, average='micro')

0.3333333333333333

In [None]:
# Macro recall: unweighted mean of the per-class recalls (1, 0, 0).
recall_score(y_true, y_pred, average='macro')

0.3333333333333333

In [None]:
# Micro recall: pooled over all classes, 2 true positives out of 6 true labels.
recall_score(y_true, y_pred, average='micro')

0.3333333333333333

In [None]:
# Macro F1: unweighted mean of the per-class F1 scores (0.8, 0, 0).
f1_score(y_true, y_pred, average='macro')

0.26666666666666666

In [None]:
# Micro F1: computed from the pooled counts; equals micro precision and recall here.
f1_score(y_true, y_pred, average='micro')

0.3333333333333333

# Example: Text Classification with LogisticRegression

In this section we will go through the process to perform text classification with LogisticRegression using sklearn.

Basically, we first need to extract a text representation with sklearn and then use one of sklearn's built-in models to learn the classifier.

Here let's use the 20-newsgroup data as an example.

## Text Representation

The first step is to represent the text as vectors/features, such as bag-of-words or tf-idf features. Here we will extract tf-idf features as an example. More feature extractors can be found here: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_extraction.text

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1, 2) extracts unigrams and bigrams; stop_words='english'
# removes scikit-learn's built-in English stop words.
tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')

In [None]:
# Learn the vocabulary and IDF weights from every post, then return the
# tf-idf document-term matrix (one row per post).
features = tfidf.fit_transform(text)

In [None]:
# (number of posts, number of distinct unigram/bigram features)
features.shape

(11314, 1186545)

## Learning the Classification Model

There are many different models in sklearn, such as the Naive Bayes classifier, the Logistic Regression classifier, the Linear Support Vector Machine, etc. Here we use the Logistic Regression classifier as an example. More details can be found here: https://scikit-learn.org/stable/supervised_learning.html#supervised-learning

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Split the original data into train and test sets first, so the vectorizer
# never sees the test documents. random_state=0 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(text, labels, random_state=0)

# Extract features: learn the vocabulary/IDF weights on the training text and
# vectorize it in a single pass (fit + transform would tokenize it twice).
tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
X_train_features = tfidf.fit_transform(X_train)

# Re-use the fitted vocabulary for the test text (transform only, never fit).
X_test_features = tfidf.transform(X_test)

In [None]:
# Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
# Linear Support Vector Machine
from sklearn.svm import LinearSVC

# Train a logistic-regression classifier with default hyperparameters on the
# tf-idf training features; MultinomialNB or LinearSVC (imported above) can be
# swapped in here in the same way.
clf = LogisticRegression().fit(X_train_features, y_train)

## Evaluating the Classification Model

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Predict a topic id for every held-out test document.
# Note: this rebinds y_pred, which earlier held the toy metric example.
y_pred = clf.predict(X_test_features)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Share of test documents whose predicted topic matches the true topic.
accuracy_score(y_test, y_pred)

0.9020855425945564

In [None]:
# Macro F1: unweighted mean of the per-topic F1 scores.
f1_score(y_test, y_pred, average='macro')

0.8999629673074958

In [None]:
# Micro F1: computed from counts pooled over all topics; with exactly one
# label per document it coincides with the accuracy reported above.
f1_score(y_test, y_pred, average='micro')

0.9020855425945564

# Exercise: Hate-speech Classification

1. You could first follow the example above and apply the code to the hate-speech classification task.


2. You could also try different [feature extractors](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_extraction.text) in sklearn and different classification models like `MultinomialNB` and `LinearSVC`. Will these different combinations show better performance? (Actually, no single model/feature performs best for all tasks/datasets. You will need to explore different combinations to select the best method for your own task in the future.)

3. Based on the example above and the word embeddings learned in the previous lecture, could you try to build a Logistic Regression classifier with embeddings as the input features? Will you achieve better performance than with tf-idf input features? (Hint: To transform the input text into vector representations/features, you could refer to the code we wrote in the text clustering section.)


***WARNING: The data, lexicons, and notebooks all contain content that is racist, sexist, homophobic, and offensive in many other ways.***



First, let's download the dataset.

The dataset is from `Thomas Davidson, Dana Warmsley, Michael Macy, and Ingmar Weber. 2017. "Automated Hate Speech Detection and the Problem of Offensive Language." ICWSM`

More details could be found here: https://github.com/t-davidson/hate-speech-and-offensive-language

In [47]:
!git clone https://github.com/t-davidson/hate-speech-and-offensive-language.git

Cloning into 'hate-speech-and-offensive-language'...
remote: Enumerating objects: 32, done.[K
remote: Total 32 (delta 0), reused 0 (delta 0), pack-reused 32[K
Unpacking objects: 100% (32/32), done.


In [48]:
import pandas as pd
# Load the labelled tweets from the cloned repository.
# Note: this rebinds raw_data, which previously held the newsgroups frame.
raw_data = pd.read_csv('./hate-speech-and-offensive-language/data/labeled_data.csv')

In [49]:
# Display the tweet frame: annotator counts per category, the majority
# class label (class) and the tweet text (tweet).
raw_data

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies


*count* = number of CrowdFlower users who coded each tweet (min is 3, sometimes more users coded a tweet when judgments were determined to be unreliable by CF).

*hate_speech* = number of CF users who judged the tweet to be hate speech.

*offensive_language* = number of CF users who judged the tweet to be offensive.

*neither* = number of CF users who judged the tweet to be neither hate speech nor offensive language.

*class* = class label for majority of CF users. 0 - hate speech 1 - offensive language 2 - neither

In [51]:
# Read the text for classification.
# .tolist() copies the column in row order in a single call; the previous
# per-row label lookup was slow and only worked with a default 0..n-1 index.
text = raw_data['tweet'].tolist()

# Read the labels (0 - hate speech, 1 - offensive language, 2 - neither),
# in the same row order as `text`.
labels = raw_data['class'].tolist()