In [None]:
#import library
import pandas as pd
import numpy as np

#machine learning library
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

#wordcloud library
from wordcloud import WordCloud, STOPWORDS
import datetime, nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#nltk
# Resources used by the text pre-processing section. 'punkt_tab' is the
# tokenizer data that recent NLTK releases look up for word_tokenize;
# 'punkt' is kept for older releases.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
  nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# **Handling Imbalanced Data**

In [None]:
# human capital
human_cap = pd.read_csv("https://raw.githubusercontent.com/densaiko/data_science_learning/main/dataset/Human%20Capital.csv")
print("Human capital data contain {} rows and {} columns \n".format(human_cap.shape[0], human_cap.shape[1]))
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None".
human_cap.info()
print()
human_cap.describe()

Diabetes data contain 54808 rows and 13 columns 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   employee_id           54808 non-null  int64  
 1   department            54808 non-null  object 
 2   region                54808 non-null  object 
 3   education             52399 non-null  object 
 4   gender                54808 non-null  object 
 5   recruitment_channel   54808 non-null  object 
 6   no_of_trainings       54808 non-null  int64  
 7   age                   54808 non-null  int64  
 8   previous_year_rating  50684 non-null  float64
 9   length_of_service     54808 non-null  int64  
 10  awards_won            54808 non-null  int64  
 11  avg_training_score    52248 non-null  float64
 12  is_promoted           54808 non-null  int64  
dtypes: float64(2), int64(6), object(5)
memory usage: 5.4+ MB
None 



Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,awards_won,avg_training_score,is_promoted
count,54808.0,54808.0,54808.0,50684.0,54808.0,54808.0,52248.0,54808.0
mean,39195.830627,1.253011,34.803915,3.329256,5.865512,0.023172,63.712238,0.08517
std,22586.581449,0.609264,7.660169,1.259993,4.265094,0.15045,13.52191,0.279137
min,1.0,1.0,20.0,1.0,1.0,0.0,39.0,0.0
25%,19669.75,1.0,29.0,3.0,3.0,0.0,51.0,0.0
50%,39225.5,1.0,33.0,3.0,5.0,0.0,60.0,0.0
75%,58730.5,1.0,39.0,4.0,7.0,0.0,77.0,0.0
max,78298.0,10.0,60.0,5.0,37.0,1.0,99.0,1.0


In [None]:
def label_encoding(data):
  """
  Return an integer-encoded copy of a dataset.

  data: your dataset (left unmodified)

  Rows containing any missing value are dropped first, then every
  object-dtype column is replaced by the integer codes produced by
  LabelEncoder. Other columns are returned as they are.
  """

  # work on a copy without incomplete rows
  encoded = data.copy().dropna()

  # one encoder is reused; fit_transform refits it for each column
  encoder = LabelEncoder()

  # only the non numerical variables/fields need encoding
  for column in encoded.select_dtypes(include=['object']).columns.to_list():
    encoded[column] = encoder.fit_transform(encoded[column])

  return encoded

In [None]:
# encode the object columns (rows with missing values are dropped inside)
new_human_cap = label_encoding(data=human_cap)
new_human_cap.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won,avg_training_score,is_promoted
0,65438,7,31,2,0,2,1,35,5.0,8,0,49.0,0
1,65141,4,14,0,1,0,1,30,5.0,4,0,60.0,0
2,7513,7,10,0,1,2,1,34,3.0,7,0,50.0,0
3,2542,7,15,0,1,0,2,39,1.0,10,0,50.0,0
4,48945,8,18,0,1,0,1,45,3.0,2,0,73.0,0


In [None]:
# employee_id only identifies the row, so it is left out of the features
# together with the target column.
X = new_human_cap.drop(columns=['is_promoted', 'employee_id'])
y = new_human_cap['is_promoted']

#implement train test split
# stratify on y so both splits keep the same share of promoted employees
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43, stratify=y)

In [None]:
def modelling(X_train, y_train, X_test, y_test, random_state=43):
  """
  Fit a gradient boosting classifier and print how it scores.

  X_train, y_train: features and labels the classifier is fitted on
  X_test, y_test: features and labels used for the held-out evaluation
  random_state: seed handed to GradientBoostingClassifier so repeated runs
                build the same model

  Prints training accuracy, testing accuracy and a per-class report for
  the test set. Returns nothing.
  """

  # modelling with gradient boosting
  clf = GradientBoostingClassifier(random_state=random_state)
  clf.fit(X_train, y_train)

  # Evaluation
  y_predict_train = clf.predict(X_train)
  y_predict_test = clf.predict(X_test)

  training_acc = accuracy_score(y_train, y_predict_train)
  testing_acc = accuracy_score(y_test, y_predict_test)

  print("Training Accuracy: {}".format(training_acc))
  print("Testing Accuracy: {}".format(testing_acc))

  # Accuracy alone does not show how each class is handled, so the
  # per-class precision/recall/F1 on the test set is printed as well.
  print(classification_report(y_test, y_predict_test))

In [None]:
# baseline: fit on the training split exactly as it is
modelling(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

Training Accuracy: 0.9356673134971971
Testing Accuracy: 0.9379042690815006


## **Undersampling**

In [None]:
#set the undersampling
# random_state fixes which majority-class rows are kept, so re-runs give the
# same resampled set (same seed as the train/test split)
undersample = RandomUnderSampler(sampling_strategy=0.5, random_state=43) #set your strategy

#fit the data
X_under, y_under = undersample.fit_resample(X_train, y_train)

print(Counter(y_train))
print(Counter(y_under))

Counter({0: 33805, 1: 3299})
Counter({0: 6598, 1: 3299})


In [None]:
# train on the undersampled data, evaluate on the untouched test split
modelling(X_train=X_under, y_train=y_under, X_test=X_test, y_test=y_test)

Training Accuracy: 0.7971102354248762
Testing Accuracy: 0.9007115135834411


## **Oversampling**

In [None]:
#set the oversampling
# random_state fixes which minority-class rows get duplicated, so re-runs give
# the same resampled set (same seed as the train/test split)
oversample = RandomOverSampler(sampling_strategy=0.5, random_state=43) #set your strategy

#fit the data
X_over, y_over = oversample.fit_resample(X_train, y_train)

print(Counter(y_train))
print(Counter(y_over))

Counter({0: 33805, 1: 3299})
Counter({0: 33805, 1: 16902})


In [None]:
# train on the oversampled data, evaluate on the untouched test split
modelling(X_train=X_over, y_train=y_over, X_test=X_test, y_test=y_test)

Training Accuracy: 0.7968525055712229
Testing Accuracy: 0.9013583441138422


## **SMOTE**

In [None]:
# Handling imbalance data
from imblearn.over_sampling import SMOTE

# NOTE(review): the categorical columns reaching SMOTE here are integer codes
# from label_encoding — confirm that synthesizing values between codes is intended.
sm = SMOTE(
    sampling_strategy=0.5,
    random_state=43,
)

# Fit the over sampling (training split only)
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)

# class counts before and after resampling
print("Before over sampling: {}".format(Counter(y_train)))
print("After over sampling: {}".format(Counter(y_train_smote)))

Before over sampling: Counter({0: 33805, 1: 3299})
After over sampling: Counter({0: 33805, 1: 16902})


In [None]:
# train on the SMOTE-resampled data, evaluate on the untouched test split
modelling(X_train=X_train_smote, y_train=y_train_smote, X_test=X_test, y_test=y_test)

Training Accuracy: 0.9120634231960084
Testing Accuracy: 0.9223803363518758




---



# **Text Pre-Processing**

In [None]:
# news dataset: ';'-separated file, first row is the header, first column is the index
news_data = pd.read_csv(
    "https://raw.githubusercontent.com/frfusch21/Data-Science-Modul/master/Datasets/News%20Title.csv",
    sep=";",
    header=0,
    index_col=0,
)
news_data.head()

Unnamed: 0_level_0,News Title,Category
No,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Google+ rolls out 'Stories' for tricked out ph...,Technology
2,Dov Charney's Redeeming Quality,Business
3,White God adds Un Certain Regard to the Palm Dog,Entertainment
4,"Google shows off Androids for wearables, cars,...",Technology
5,China May new bank loans at 870.8 bln yuan,Business


In [None]:
print("News data contain {} rows and {} columns \n".format(news_data.shape[0], news_data.shape[1]))
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None".
news_data.info()
print()
news_data.describe()

News data contain 65535 rows and 2 columns 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65535 entries, 1 to 65535
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   News Title  65535 non-null  object
 1   Category    65535 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB
None 



Unnamed: 0,News Title,Category
count,65535,65535
unique,64981,4
top,The article requested cannot be found! Please ...,Entertainment
freq,21,23961


## **Data Cleansing**


### Removing Punctuation

In [None]:
#list of punctuations
# (every character in this string is stripped from the titles by the cleaning code below)
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
# example of taking out punctuations from the title with index label 1,
# then lower-casing what is left
nopunc = ''.join(
  char for char in news_data['News Title'][1] if char not in string.punctuation
).lower()

print("Before: {} \n After: {}".format(news_data['News Title'][1], nopunc))

Before: Google+ rolls out 'Stories' for tricked out photo playback 
 After: google rolls out stories for tricked out photo playback


### Tokenization


In [None]:
# example tokenization: split the cleaned title into a list of word tokens
word_tokens = word_tokenize(text=nopunc)
word_tokens

['google',
 'rolls',
 'out',
 'stories',
 'for',
 'tricked',
 'out',
 'photo',
 'playback']

### Removing Stopwords

In [None]:
# example implementation of stopwords
# a set gives fast membership checks while filtering
stop_words = set(stopwords.words('english'))

# keep only the tokens that are not English stop words
clean_words = [w for w in word_tokens if w not in stop_words]
clean_words

['google', 'rolls', 'stories', 'tricked', 'photo', 'playback']



---



In [None]:
#apply to dataframe

def punctuation_removal(text):
  """
  Strip punctuation from a piece of text and lower-case it.

  text: the string to clean
  Returns the text without any character listed in string.punctuation,
  converted to lower case.
  """
  kept_chars = filter(lambda char: char not in string.punctuation, text)
  return ''.join(kept_chars).lower()

def tokenization(text):
  """
  Split a piece of text into word tokens.

  text: the string to tokenize
  Returns whatever word_tokenize produces for that text.
  """
  return word_tokenize(text)

# English stop-word set, built on first use and then reused for every row
_ENGLISH_STOP_WORDS = None

def _english_stop_words():
  """Return the cached English stop-word set, loading it on the first call."""
  global _ENGLISH_STOP_WORDS
  if _ENGLISH_STOP_WORDS is None:
    _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
  return _ENGLISH_STOP_WORDS

def stopword_removal(text, stop_words=None):
  """
  Remove stop words from a list of tokens.

  text: iterable of word tokens
  stop_words: optional collection of words to drop; when omitted the
              English stop-word list is used
  Returns a new list with the remaining tokens in their original order.
  """
  #remove useless words in data
  if stop_words is None:
    # loaded once instead of once per call (this runs for every row)
    stop_words = _english_stop_words()
  return [w for w in text if w not in stop_words]


In [None]:
# each step reads the column produced by the previous one
news_data['News Title (Punctuation Removed)'] = news_data['News Title'].map(punctuation_removal)
news_data['News Title (Tokenized)'] = news_data['News Title (Punctuation Removed)'].map(tokenization)
news_data['News Title (Stopword Removed)'] = news_data['News Title (Tokenized)'].map(stopword_removal)

In [None]:
# preview the first rows of the news frame
news_data.head()

Unnamed: 0_level_0,News Title,Category,News Title (Punctuation Removed),News Title (Tokenized),News Title (Stopword Removed)
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Google+ rolls out 'Stories' for tricked out ph...,Technology,google rolls out stories for tricked out photo...,"[google, rolls, out, stories, for, tricked, ou...","[google, rolls, stories, tricked, photo, playb..."
2,Dov Charney's Redeeming Quality,Business,dov charneys redeeming quality,"[dov, charneys, redeeming, quality]","[dov, charneys, redeeming, quality]"
3,White God adds Un Certain Regard to the Palm Dog,Entertainment,white god adds un certain regard to the palm dog,"[white, god, adds, un, certain, regard, to, th...","[white, god, adds, un, certain, regard, palm, ..."
4,"Google shows off Androids for wearables, cars,...",Technology,google shows off androids for wearables cars tvs,"[google, shows, off, androids, for, wearables,...","[google, shows, androids, wearables, cars, tvs]"
5,China May new bank loans at 870.8 bln yuan,Business,china may new bank loans at 8708 bln yuan,"[china, may, new, bank, loans, at, 8708, bln, ...","[china, may, new, bank, loans, 8708, bln, yuan]"


# **Data Pre-processing**

### Stemming


In [None]:
# example implementation of stemming
ps = PorterStemmer()

# stem every token of the cleaned example title
stemmed_words = [ps.stem(word) for word in clean_words]

' '.join(stemmed_words)


'googl roll stori trick photo playback'

### Lemmatization

In [None]:
# example implementation of lemmatization
lem = WordNetLemmatizer()

# lemmatize every token of the cleaned example title
lem_words = [lem.lemmatize(word) for word in clean_words]

' '.join(lem_words)

'google roll story tricked photo playback'

In [None]:
#apply to dataframe

def stemming(text):
  """
  Stem every token and join the results into one string.

  text: iterable of word tokens
  Uses the notebook-level stemmer `ps`; returns the stems separated by
  single spaces.
  """
  return ' '.join(ps.stem(token) for token in text)

def lemmatization(text):
  """
  Lemmatize every token and join the results into one string.

  text: iterable of word tokens
  Uses the notebook-level lemmatizer `lem`; returns the lemmas separated
  by single spaces.
  """
  return ' '.join(lem.lemmatize(token) for token in text)


In [None]:
# both columns are derived from the stop-word-free token lists
news_data['News Title (Stemming)'] = news_data['News Title (Stopword Removed)'].map(stemming)
news_data['News Title (Lemmatization)'] = news_data['News Title (Stopword Removed)'].map(lemmatization)

news_data.head()

Unnamed: 0_level_0,News Title,Category,News Title (Punctuation Removed),News Title (Tokenized),News Title (Stopword Removed),News Title (Stemming),News Title (Lemmatization)
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Google+ rolls out 'Stories' for tricked out ph...,Technology,google rolls out stories for tricked out photo...,"[google, rolls, out, stories, for, tricked, ou...","[google, rolls, stories, tricked, photo, playb...",googl roll stori trick photo playback,google roll story tricked photo playback
2,Dov Charney's Redeeming Quality,Business,dov charneys redeeming quality,"[dov, charneys, redeeming, quality]","[dov, charneys, redeeming, quality]",dov charney redeem qualiti,dov charneys redeeming quality
3,White God adds Un Certain Regard to the Palm Dog,Entertainment,white god adds un certain regard to the palm dog,"[white, god, adds, un, certain, regard, to, th...","[white, god, adds, un, certain, regard, palm, ...",white god add un certain regard palm dog,white god add un certain regard palm dog
4,"Google shows off Androids for wearables, cars,...",Technology,google shows off androids for wearables cars tvs,"[google, shows, off, androids, for, wearables,...","[google, shows, androids, wearables, cars, tvs]",googl show android wearabl car tv,google show android wearable car tv
5,China May new bank loans at 870.8 bln yuan,Business,china may new bank loans at 8708 bln yuan,"[china, may, new, bank, loans, at, 8708, bln, ...","[china, may, new, bank, loans, 8708, bln, yuan]",china may new bank loan 8708 bln yuan,china may new bank loan 8708 bln yuan


### Bag of Words

In [None]:
#example Bag of Words

sent1 = 'my favorite search engine is google'
sent2 = 'hell ya google'

cv = CountVectorizer()

# stored under its own name so the feature matrix X built for the
# human-capital model is not overwritten by this toy example
bow_example = cv.fit_transform([sent1,sent2])

# get_feature_names_out() is the current scikit-learn API; releases before 1.0 use cv.get_feature_names()
bow_sklearn = pd.DataFrame(bow_example.toarray(),columns=cv.get_feature_names_out())
bow_sklearn.head()

Unnamed: 0,engine,favorite,google,hell,is,my,search,ya
0,1,1,1,0,1,1,1,0
1,0,0,1,1,0,0,0,1


In [None]:
# calculate the Bag of Word in news data
# NOTE(review): the vocabulary is learned from every title, i.e. before the
# train/test split that follows — confirm this is acceptable.
message_bow = CountVectorizer().fit_transform(
  news_data['News Title (Lemmatization)'].tolist()
)
message_bow.shape

(65535, 29645)

In [None]:
#split the dataset into 80% training data and 20% testing data
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(
  message_bow,
  news_data['Category'],
  test_size=0.20,
  random_state=0,
)


In [None]:
# generic fit-and-score helper (first used with the naive-bayes model)
def modelling(model, X_train, y_train, X_text, y_test):
  """
  Fit a model and print its training and testing accuracy.

  model: estimator exposing fit(X, y) and score(X, y)
  X_train, y_train: features and labels the model is fitted on
  X_text, y_test: held-out features and labels (the parameter name
                  X_text is kept as it is for existing callers)

  Prints both accuracies as percentages with two decimals.
  Returns nothing.
  """
  model.fit(X_train, y_train)

  # score() is the only evaluation needed here; the separate predict()
  # passes whose results were never used have been removed.
  train_accuracy = model.score(X_train, y_train)
  test_accuracy = model.score(X_text, y_test)

  print('Training accuracy {:.2f}%'.format(train_accuracy*100))
  print('Testing accuracy {:.2f}%'.format(test_accuracy*100))

In [None]:
# modelling and test the model with the test data
# (multinomial naive Bayes fitted on the bag-of-words training split)
modelling(MultinomialNB(), X_train_bow, y_train_bow, X_test_bow, y_test_bow)

Training accuracy 93.75%
Testing accuracy 90.67%


### TF-IDF


In [None]:
# TF-IDF weighted features for the lemmatized titles
# NOTE(review): the vectorizer is fit on every title, i.e. before the
# train/test split that follows — confirm this is acceptable.
idf_matrix = TfidfVectorizer(min_df=0., max_df=1., use_idf=True).fit_transform(
  news_data['News Title (Lemmatization)'].tolist()
)
idf_matrix.shape

(65535, 29645)

In [None]:
#split the dataset into 80% training data and 20% testing data
X_train_idf, X_test_idf, y_train_idf, y_test_idf = train_test_split(
  idf_matrix,
  news_data['Category'],
  test_size=0.20,
  random_state=0,
)

In [None]:
# modelling and test the model with the test data
# (multinomial naive Bayes fitted on the TF-IDF training split)
modelling(MultinomialNB(), X_train_idf, y_train_idf, X_test_idf, y_test_idf)

Training accuracy 92.68%
Testing accuracy 89.67%


## **Logistic Regression Model with TF-IDF**


In [None]:
def modelling2(model, X_train, y_train, X_test, y_test):
    """
    Fit a model and print its training and testing accuracy.

    model: estimator exposing fit(X, y) and score(X, y)
    X_train, y_train: features and labels the model is fitted on
    X_test, y_test: held-out features and labels used for evaluation

    Prints both accuracies as percentages with two decimals.
    Returns nothing.
    """
    model.fit(X_train, y_train)

    # score() is the only evaluation needed here; the separate predict()
    # passes whose results were never used have been removed.
    train_accuracy = model.score(X_train, y_train)
    test_accuracy = model.score(X_test, y_test)

    print('Training accuracy: {:.2f}%'.format(train_accuracy * 100))
    print('Testing accuracy: {:.2f}%'.format(test_accuracy * 100))

In [None]:
# logistic regression on the TF-IDF split, evaluated on its test part
modelling2(
    model=LogisticRegression(max_iter=1000),
    X_train=X_train_idf,
    y_train=y_train_idf,
    X_test=X_test_idf,
    y_test=y_test_idf,
)

Training accuracy: 95.34%
Testing accuracy: 91.10%
