In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so the files stored there can be read by path (Colab runtime only).
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Folder on the mounted Drive that holds the BBC News CSV files.
DATA_PATH = "/content/drive/MyDrive/ca.webinars/Mastering Machine Learning/Episode 2/data"
# File name of the training CSV, relative to DATA_PATH.
TRAIN_DATA = "BBC News Train.csv"

In [15]:
import pandas as pd
import os
# Read the training CSV from the Drive data folder into a DataFrame.
train_csv_path = os.path.join(DATA_PATH, TRAIN_DATA)
train_df = pd.read_csv(train_csv_path)

In [5]:
# Preview the first rows of the training data (columns: ArticleId, Text, Category).
train_df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [6]:
# Number of articles per category, to check how balanced the classes are.
train_df.groupby('Category')[['Text']].count()

Unnamed: 0_level_0,Text
Category,Unnamed: 1_level_1
business,336
entertainment,273
politics,274
sport,346
tech,261


In [11]:
# Build the id -> label lookup with factorize(), the same encoding that is used
# for the `category_id` column, so the mapping cannot drift from the ids
# (factorize() skips missing values, whereas drop_duplicates() would keep NaN
# as an extra entry and shift every later index).
categories_list = train_df.Category.factorize()[1].to_list()
index_list = list(range(len(categories_list)))
my_cat_mapping = dict(zip(index_list, categories_list))

In [16]:
# Encode each category label as an integer id, numbered in order of first appearance.
train_df['category_id'] = pd.factorize(train_df['Category'])[0]

In [17]:
# Preview again to confirm the new `category_id` column sits next to `Category`.
train_df.head()

Unnamed: 0,ArticleId,Text,Category,category_id
0,1833,worldcom ex-boss launches defence lawyers defe...,business,0
1,154,german business confidence slides german busin...,business,0
2,1101,bbc poll indicates economic gloom citizens in ...,business,0
3,1976,lifestyle governs mobile choice faster bett...,tech,1
4,917,enron bosses in $168m payout eighteen former e...,business,0


In [26]:
# Show the full raw text of the first training article.
train_df['Text'].iloc[0]



## Creating Numerical Embedding for Text Features: TfidfVectorizer


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer: English stop words removed, unigrams and bigrams
# (ngram_range=(1, 2)), min_df=5, sublinear term-frequency scaling, L2 norm.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    min_df=5,
    sublinear_tf=True,
    norm='l2',
)

In [20]:
# Fit the vectorizer on the training texts and convert the result to a dense array.
# NOTE(review): the vectorizer is fitted on the whole training set before the
# cross-validation further down, so every fold shares the same vocabulary/IDF
# statistics; consider a Pipeline if leakage-free CV estimates are needed.
tfidf_matrix = tfidf.fit_transform(train_df['Text'])
X = tfidf_matrix.toarray()
# Integer class ids are the prediction target.
y = train_df['category_id']

In [21]:
# Inspect the dense TF-IDF feature matrix (one row per article).
X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.03250994, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.03842857, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [22]:
# Inspect the target vector of integer category ids.
y

0       0
1       0
2       0
3       1
4       0
       ..
1485    4
1486    4
1487    0
1488    1
1489    1
Name: category_id, Length: 1490, dtype: int64

## Fitting a Classifier

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score

# Candidate classifiers to compare; both are seeded for reproducible results.
models = [
    LogisticRegression(random_state=0),
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
]

In [29]:
def getting_models_metrics(cv_param, models, features=None, labels=None):
  """Cross-validate each model and collect the per-fold accuracies.

  Args:
    cv_param: Value forwarded to ``cross_val_score`` as ``cv`` (for example
      the number of folds).
    models: Iterable of estimators to evaluate.
    features: Feature matrix to evaluate on. Defaults to the notebook-level ``X``.
    labels: Target vector. Defaults to the notebook-level ``y``.

  Returns:
    A DataFrame with one row per (model, fold) and the columns
    ``model_name``, ``fold_idx`` and ``accuracy``.
  """
  # Fall back to the notebook globals so existing two-argument calls keep working.
  if features is None:
    features = X
  if labels is None:
    labels = y

  entries = []
  for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=cv_param)
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )
  # Build the frame once from the collected records; no preallocation is needed,
  # which also means `cv_param` is not required to be an integer.
  return pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [30]:
# Score every candidate model with cv=5 cross-validation.
n_folds = 5
cv_df = getting_models_metrics(n_folds, models)

In [32]:
# Average the fold accuracies per model to pick the best performer.
cv_df.groupby('model_name')['accuracy'].mean()

model_name
LogisticRegression        0.975168
RandomForestClassifier    0.848322
Name: accuracy, dtype: float64

### Using the Best Model

In [33]:
# Refit the best cross-validated model (logistic regression) on the full training set.
model = LogisticRegression(random_state=0)
model.fit(X,y) # X is the TFIDF matrix

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Evaluating on the Test Set

In [34]:
# Load the unlabelled test articles from the same Drive data folder.
TEST_FILENAME = 'BBC News Test.csv'
test_csv_path = os.path.join(DATA_PATH, TEST_FILENAME)
test_df = pd.read_csv(test_csv_path)

In [35]:
# Preview the test data; it has ArticleId and Text but no Category column.
test_df.head()

Unnamed: 0,ArticleId,Text
0,1018,qpr keeper day heads for preston queens park r...
1,1319,software watching while you work software that...
2,1138,d arcy injury adds to ireland woe gordon d arc...
3,459,india s reliance family feud heats up the ongo...
4,1020,boro suffer morrison injury blow middlesbrough...


In [38]:
# Vectorize the test texts with the already-fitted TF-IDF vocabulary
# (transform only, no re-fit), then predict a category id per article.
test_texts = test_df['Text'].tolist()
X_test = tfidf.transform(test_texts)
y_pred = model.predict(X_test)

In [40]:
# Attach the predictions positionally. assign() does not align on the index,
# so rows cannot be mismatched or padded with NaN if `test_df` ever carries a
# non-default index (pd.concat(axis=1) would align on index labels).
test_df_enriched = test_df.assign(y_pred=y_pred)
# Translate the predicted ids back to their category names.
test_df_enriched['y_pred_label'] = test_df_enriched['y_pred'].map(my_cat_mapping)

In [41]:
# Preview the test rows together with the predicted id and label.
test_df_enriched.head()

Unnamed: 0,ArticleId,Text,y_pred,y_pred_label
0,1018,qpr keeper day heads for preston queens park r...,3,sport
1,1319,software watching while you work software that...,1,tech
2,1138,d arcy injury adds to ireland woe gordon d arc...,3,sport
3,459,india s reliance family feud heats up the ongo...,0,business
4,1020,boro suffer morrison injury blow middlesbrough...,3,sport


In [42]:
# Load the reference labels and rename their column so it is distinguishable
# from the predicted label after merging.
# NOTE(review): the file name suggests this is a sample-submission template
# rather than real ground truth. The evaluation below reports ~0.19 accuracy
# with an identical support of 147 for every class, versus ~0.975 in
# cross-validation -- verify these labels before trusting the test metrics.
y_true = (
    pd.read_csv(os.path.join(DATA_PATH, 'BBC News Sample Solution.csv'))
    .rename(columns={'Category': 'y_true_label'})
)

In [43]:
# Join predictions with the reference labels on the article identifier (inner join).
full_test_df = pd.merge(test_df_enriched, y_true, on='ArticleId')

In [54]:
# Preview the merged predictions and reference labels.
# NOTE(review): the saved output includes a `y_true` column that is only created
# by a later cell (this cell was executed out of order); on a fresh
# Restart & Run All the preview at this point will not contain it.
full_test_df.head()

Unnamed: 0,ArticleId,Text,y_pred,y_pred_label,y_true_label,y_true
0,1018,qpr keeper day heads for preston queens park r...,3,sport,sport,3
1,1319,software watching while you work software that...,1,tech,tech,1
2,1138,d arcy injury adds to ireland woe gordon d arc...,3,sport,business,0
3,459,india s reliance family feud heats up the ongo...,0,business,entertainment,4
4,1020,boro suffer morrison injury blow middlesbrough...,3,sport,politics,2


In [48]:
# One row per (Category, category_id) pair, ordered by id.
category_id_df = (
    train_df.loc[:, ['Category', 'category_id']]
    .drop_duplicates()
    .sort_values(by='category_id')
)

In [50]:
# Label -> id lookup, used to encode the reference labels like the training target.
category_to_id = dict(zip(category_id_df['Category'], category_id_df['category_id']))

In [53]:
# Encode the reference labels with the training ids so they are comparable to `y_pred`.
true_ids = full_test_df['y_true_label'].map(category_to_id)
full_test_df['y_true'] = true_ids

In [56]:
from sklearn.metrics import multilabel_confusion_matrix, classification_report

In [57]:
# Per-class precision / recall / F1 of the predictions against the reference ids.
report = classification_report(full_test_df['y_true'], full_test_df['y_pred'])
print(report)

              precision    recall  f1-score   support

           0       0.20      0.24      0.22       147
           1       0.17      0.16      0.16       147
           2       0.16      0.16      0.16       147
           3       0.20      0.23      0.22       147
           4       0.20      0.16      0.18       147

    accuracy                           0.19       735
   macro avg       0.19      0.19      0.19       735
weighted avg       0.19      0.19      0.19       735



In [58]:
# One-vs-rest 2x2 matrix per class id, laid out as [[TN, FP], [FN, TP]].
multilabel_confusion_matrix(full_test_df['y_true'], full_test_df['y_pred'])

array([[[446, 142],
        [111,  36]],

       [[477, 111],
        [124,  23]],

       [[467, 121],
        [124,  23]],

       [[456, 132],
        [113,  34]],

       [[498,  90],
        [124,  23]]])

**End Webinar**