# Imbalanced Dataset

Testing for 2 different scenarios to check the impact of imbalanced datasets

- Multiclass classification with large size text documents (News Articles)
- Binary classification with small text documents (messages)

### Importing required libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, recall_score, precision_score
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Defining all the functions

In [2]:
def create_imbalance(dataset, var_name, var_category='business', sample_percentage = 10):
    category_articles = dataset[dataset[var_name] == var_category].copy()
    dataset = dataset[train_X[var_name] != var_category].copy()
    sample_size = int(category_articles.shape[0]*10/100)
    category_articles = category_articles.sample(sample_size)
    dataset = pd.concat([dataset, category_articles])
    return dataset

In [3]:
def mapping_categories(dataset, var_categories):
    category_map = {cat:i for i,cat in enumerate(var_categories)}
    dataset = dataset.map(category_map)
    return dataset

# Testing the individual run

### Importing dataset and converting the target variable which is currently text in to numeric categories

In [4]:
inp_dataset = pd.read_csv("C:\\Ujjwal\\Analytics\\Datasets\\News Classification\\News_train.csv")
inp_dataset.head(2)

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business


In [5]:
### Checking the distribution of articles across categories
inp_dataset.Category.value_counts()

sport            346
business         336
politics         274
entertainment    273
tech             261
Name: Category, dtype: int64

### Splitting data in to test and train dataset

In [6]:
train_X, test_X, train_y, test_y = train_test_split(inp_dataset, inp_dataset["Category"], random_state = 8)

In [7]:
#### Generating Imbalance 
train_X = create_imbalance(train_X, var_name="Category", var_category="business", sample_percentage=5)
train_y = train_X["Category"].copy()
train_X.Category.value_counts()

sport            261
entertainment    211
politics         208
tech             192
business          24
Name: Category, dtype: int64

In [8]:
#### Mapping category to numerical equivalents
categories = train_y.unique()
train_y = mapping_categories(train_y, var_categories=categories)
test_y = mapping_categories(test_y, var_categories=categories)

### Creating a count verctor for both test and train data

In [9]:
Cnt_Vec = CountVectorizer(stop_words="english")
train_data = Cnt_Vec.fit_transform(train_X['Text']).toarray()
test_data = Cnt_Vec.transform(test_X['Text']).toarray()

### Building the model and checking the accuracy on test data

In [11]:
alpha = .01
Muti_NB = MultinomialNB(alpha=alpha)
Muti_NB.fit(train_data, train_y)
pred = Muti_NB.predict(test_data)
acc = accuracy_score(test_y,pred)

In [133]:
pd.crosstab(pred,test_y)

Category,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,85,0,0,0,0
2,0,61,1,0,2
3,0,0,64,0,33
4,0,1,1,69,12
5,0,0,0,0,44
