# **Handling Imbalance Data**

In [18]:
#import library
import pandas as pd
import numpy as np

#machine learning library
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [3]:
# human capital
human_cap = pd.read_csv("https://raw.githubusercontent.com/densaiko/data_science_learning/main/dataset/Human%20Capital.csv")
print("Diabetes data contain {} rows and {} columns \n".format(human_cap.shape[0], human_cap.shape[1]))
print(human_cap.info(), '\n')
human_cap.describe()

Diabetes data contain 54808 rows and 13 columns 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   employee_id           54808 non-null  int64  
 1   department            54808 non-null  object 
 2   region                54808 non-null  object 
 3   education             52399 non-null  object 
 4   gender                54808 non-null  object 
 5   recruitment_channel   54808 non-null  object 
 6   no_of_trainings       54808 non-null  int64  
 7   age                   54808 non-null  int64  
 8   previous_year_rating  50684 non-null  float64
 9   length_of_service     54808 non-null  int64  
 10  awards_won            54808 non-null  int64  
 11  avg_training_score    52248 non-null  float64
 12  is_promoted           54808 non-null  int64  
dtypes: float64(2), int64(6), object(5)
memory usage: 5.4+ MB
None 



Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,awards_won,avg_training_score,is_promoted
count,54808.0,54808.0,54808.0,50684.0,54808.0,54808.0,52248.0,54808.0
mean,39195.830627,1.253011,34.803915,3.329256,5.865512,0.023172,63.712238,0.08517
std,22586.581449,0.609264,7.660169,1.259993,4.265094,0.15045,13.52191,0.279137
min,1.0,1.0,20.0,1.0,1.0,0.0,39.0,0.0
25%,19669.75,1.0,29.0,3.0,3.0,0.0,51.0,0.0
50%,39225.5,1.0,33.0,3.0,5.0,0.0,60.0,0.0
75%,58730.5,1.0,39.0,4.0,7.0,0.0,77.0,0.0
max,78298.0,10.0,60.0,5.0,37.0,1.0,99.0,1.0


In [8]:
def label_encoding(data):
  """
  Let's encode the non_numerical data into a numerical value
  data: your dataset
  """

  data_new = data.copy()
  data_new = data_new.dropna()

  #find non numerical variable/field
  object_data = data_new.select_dtypes(include=['object']).columns.to_list()

  #label encoder object
  label_encoding = LabelEncoder()

  #encode the data into a label
  for i in object_data:
    data_new[i] = label_encoding.fit_transform(data_new[i])

  return data_new

In [9]:
new_human_cap = label_encoding(human_cap)
new_human_cap.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won,avg_training_score,is_promoted
0,65438,7,31,2,0,2,1,35,5.0,8,0,49.0,0
1,65141,4,14,0,1,0,1,30,5.0,4,0,60.0,0
2,7513,7,10,0,1,2,1,34,3.0,7,0,50.0,0
3,2542,7,15,0,1,0,2,39,1.0,10,0,50.0,0
4,48945,8,18,0,1,0,1,45,3.0,2,0,73.0,0


In [12]:
X = new_human_cap.drop('is_promoted', axis=1)
y = new_human_cap['is_promoted']

#implement train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)

In [15]:
def modelling(X_train, y_train, X_test, y_test):

  # modelling with logistic regression
  clf = GradientBoostingClassifier()
  clf.fit(X_train, y_train)

  # Evaluation
  y_predict_train = clf.predict(X_train)
  y_predict_test = clf.predict(X_test)

  training_acc = accuracy_score(y_train, y_predict_train)
  testing_acc = accuracy_score(y_test, y_predict_test)

  print("Training Accuracy: {}".format(training_acc))
  print("Testing Accuracy: {}".format(testing_acc))

In [16]:
modelling(X_train, y_train, X_test, y_test)

Training Accuracy: 0.9356673134971971
Testing Accuracy: 0.9379042690815006


## **Undersampling**

In [19]:
#set the undersampling
undersample = RandomUnderSampler(sampling_strategy=0.5) #set your strategy

#fit the data
X_under, y_under = undersample.fit_resample(X_train, y_train)

print(Counter(y_train))
print(Counter(y_under))

Counter({0: 33805, 1: 3299})
Counter({0: 6598, 1: 3299})


In [20]:
modelling(X_under, y_under, X_test, y_test)

Training Accuracy: 0.7971102354248762
Testing Accuracy: 0.9007115135834411


## **Oversampling**

In [21]:
#set the oversampling
oversample = RandomOverSampler(sampling_strategy=0.5) #set your strategy

#fit the data
X_over, y_over = oversample.fit_resample(X_train, y_train)

print(Counter(y_train))
print(Counter(y_over))

Counter({0: 33805, 1: 3299})
Counter({0: 33805, 1: 16902})


In [22]:
modelling(X_over, y_over, X_test, y_test)

Training Accuracy: 0.7968525055712229
Testing Accuracy: 0.9013583441138422


## **SMOTE**

In [23]:
# Handling imbalance data
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=43, sampling_strategy=0.5)

# Fit the over sampling
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)

print("Before over sampling: {}".format(Counter(y_train)))
print("After over sampling: {}".format(Counter(y_train_smote)))

Before over sampling: Counter({0: 33805, 1: 3299})
After over sampling: Counter({0: 33805, 1: 16902})


In [24]:
modelling(X_train_smote, y_train_smote, X_test, y_test)

Training Accuracy: 0.9120634231960084
Testing Accuracy: 0.9223803363518758


# **Text Pre-Processing**

In [1]:
# news dataset
news_data = pd.read_excel("/content/drive/MyDrive/Data Ari Gmail/Data_Ari/Data_Science/Digital Skola/Indonesia Version/Data Preprocessing II/News Title.xls", sheet_name='Data Train')
news_data.head()

NameError: ignored