In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_pickle("../input/traintest_2019NOV_RS_UNITED.pkl")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5644 entries, 0 to 5643
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   article_id       5644 non-null   object
 1   link             5644 non-null   object
 2   headline         5644 non-null   object
 3   contents         5644 non-null   object
 4   annotation       5644 non-null   object
 5   done_by          5644 non-null   object
 6   risk_cat         5644 non-null   object
 7   annotation_note  5644 non-null   object
dtypes: object(8)
memory usage: 396.8+ KB


# Data Processing

In [4]:
import nltk
import nltk.corpus

from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
import re
import string
PUNCT_TO_REMOVE = string.punctuation

from nltk import word_tokenize
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words("english"))

def text_processing(text):
    text = text.lower()
    text = re.compile(r'https?://\S+|www\.\S+').sub(r'', text)
    text = text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
    text = " ".join([word for word in str(text).split() if word not in STOPWORDS])
    text = " ".join([stemmer.stem(word) for word in text.split()])
    return text


In [8]:
df['category'] = df['risk_cat'].apply(lambda x: x if x in ['Not Risk','Others'] else 'Risk')
df['cat_code_full'] = df['risk_cat'].astype('category').cat.codes
df['cat_code'] = df['category'].astype('category').cat.codes
df['cleaned_content'] = df['contents'].apply(lambda x:text_processing(x))
df['cleaned_headline'] = df['headline'].apply(lambda x:text_processing(x))

# TFIDF

In [23]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[['cleaned_content','cleaned_headline']], df['cat_code'], 
                                                    test_size=0.2,shuffle=True,random_state = 42)
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(df[['cleaned_content','cleaned_headline']], df['cat_code_full'], 
                                                    test_size=0.2,shuffle=True,random_state = 42)

## 3 categories

In [24]:
tfidf_content = TfidfVectorizer()
tfidf_content.fit(X_train['cleaned_content'])  
content_train = tfidf_content.transform(X_train['cleaned_content']).toarray()
content_test = tfidf_content.transform(X_test['cleaned_content']).toarray()

tfidf_headline = TfidfVectorizer()
tfidf_headline.fit(X_train['cleaned_headline'])  
headline_train = tfidf_headline.transform(X_train['cleaned_headline']).toarray()
headline_test = tfidf_headline.transform(X_test['cleaned_headline']).toarray()

X_train =  np.hstack((content_train,headline_train))
X_test =  np.hstack((content_test,headline_test))

## Full categories

In [25]:
tfidf_content_full = TfidfVectorizer()
tfidf_content_full.fit(X_train_full['cleaned_content'])  
content_train_full = tfidf_content_full.transform(X_train_full['cleaned_content']).toarray()
content_test_full = tfidf_content_full.transform(X_test_full['cleaned_content']).toarray()

tfidf_headline_full = TfidfVectorizer()
tfidf_headline_full.fit(X_train_full['cleaned_headline'])  
headline_train_full = tfidf_headline_full.transform(X_train_full['cleaned_headline']).toarray()
headline_test_full = tfidf_headline_full.transform(X_test_full['cleaned_headline']).toarray()

X_train_full =  np.hstack((content_train_full,headline_train_full))
X_test_full =  np.hstack((content_test_full,headline_test_full))

# Model
## Logistic Regression

In [26]:
from sklearn import linear_model

lr1 = linear_model.LogisticRegression(random_state=123,penalty='l2', solver='lbfgs')
lr1.fit(X_train, y_train)

LogisticRegression(random_state=123)

In [27]:
print('Classification Report(3 categories):\n',classification_report(y_test, lr1.predict(X_test)))

Classification Report(3 categories):
               precision    recall  f1-score   support

           0       0.77      0.64      0.70       253
           1       0.65      0.44      0.53       189
           2       0.77      0.89      0.83       687

    accuracy                           0.76      1129
   macro avg       0.73      0.66      0.68      1129
weighted avg       0.75      0.76      0.75      1129



In [28]:
lr2 = linear_model.LogisticRegression(random_state=123,penalty='l2', solver='lbfgs')
lr2.fit(X_train_full, y_train_full)
print('Classification Report(full categories):\n',classification_report(y_test_full, lr2.predict(X_test_full)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Classification Report(full categories):
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.84      0.70      0.76       103
           2       0.67      0.15      0.25        13
           3       1.00      0.61      0.76        18
           4       0.68      0.57      0.62       164
           5       0.71      0.42      0.53        12
           6       0.67      0.38      0.49        26
           7       1.00      0.04      0.07        26
           8       0.68      0.34      0.45        44
           9       0.00      0.00      0.00         1
          10       1.00      0.20      0.33         5
          11       1.00      0.43      0.60         7
          12       0.61      0.82      0.70       253
          13       0.92      0.58      0.71        19
          14       0.52      0.70      0.60       189
          15       0.77      0.65      0.71        63
          16       0.80      0.90      0

  _warn_prf(average, modifier, msg_start, len(result))


## Xgboost

In [29]:
import xgboost as xgb

xgb1 = xgb.XGBClassifier(n_estimators = 300, learning_rate = 0.1,random_state = 42)
xgb1.fit(X_train,y_train)
print('Classification Report(3 categories):\n',classification_report(y_test, xgb1.predict(X_test)))



Classification Report(3 categories):
               precision    recall  f1-score   support

           0       0.78      0.60      0.68       253
           1       0.59      0.43      0.50       189
           2       0.77      0.89      0.83       687

    accuracy                           0.75      1129
   macro avg       0.71      0.64      0.67      1129
weighted avg       0.74      0.75      0.74      1129



In [31]:
xgb2 = xgb.XGBClassifier(n_estimators = 200, learning_rate = 0.1,random_state = 42)
xgb2.fit(X_train_full,y_train_full)
print('Classification Report(full categories):\n',classification_report(y_test_full, xgb2.predict(X_test_full)))

Classification Report(full categories):
               precision    recall  f1-score   support

           0       1.00      0.14      0.25         7
           1       0.82      0.67      0.74       103
           2       0.33      0.15      0.21        13
           3       0.93      0.72      0.81        18
           4       0.63      0.60      0.61       164
           5       0.71      0.42      0.53        12
           6       0.65      0.50      0.57        26
           7       0.60      0.23      0.33        26
           8       0.50      0.18      0.27        44
           9       0.00      0.00      0.00         1
          10       0.50      0.20      0.29         5
          11       0.67      0.57      0.62         7
          12       0.61      0.77      0.68       253
          13       0.91      0.53      0.67        19
          14       0.50      0.67      0.57       189
          15       0.69      0.70      0.69        63
          16       0.82      0.93      0

  _warn_prf(average, modifier, msg_start, len(result))
