In [80]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder


## Reading Data

In [81]:
resume=pd.read_csv('Raw_Resume.csv')
resume

Unnamed: 0,Category,Raw_Details
0,data_analyst,DIEGO PEREZ Data Analyst Intern\n\ndiego.perez...
1,data_analyst,FARAH MARTIN \tDATA ANALYST\n\n\t\t\t\t\t\t\tC...
2,data_analyst,DEREK SOTO Data Analyst\n\nDesoto@email.com\n\...
3,data_analyst,CHARLA SWAIN \nENTRY-LEVEL RISK ADJUSTMENT DAT...
4,data_analyst,TOMISLAV ABRAMOVIC Junior Data Analyst\n\n\n\n...
5,data_engineer,ALAN SUSA \nData Engineer\n\n\n\nalansusa@emai...
6,data_engineer,BRANDON CONNOR\n\nData Engineer\n\nCAREER OBJE...
7,data_engineer,Tinuviel Winters Lead Data Engineer\n\nt.winte...
8,data_engineer,CNun@email.com\n\n\t\tMontgomery Sills \t(123)...
9,data_engineer,dtrevino@email.com\n\n\t\tDaniel Trevino (123)...


In [82]:
resume.dtypes

Category       object
Raw_Details    object
dtype: object

In [83]:
resume.shape

(20, 2)

In [84]:
resume.isna().sum()

Category       0
Raw_Details    0
dtype: int64

## Data Preprocessing

In [85]:
encoder = LabelEncoder()
resume['num_category']= encoder.fit_transform(resume['Category'])

In [86]:
resume.columns

Index(['Category', 'Raw_Details', 'num_category'], dtype='object')

In [87]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(resume['Raw_Details'])
y = resume["num_category"]

In [88]:
x

<20x1475 sparse matrix of type '<class 'numpy.int64'>'
	with 3865 stored elements in Compressed Sparse Row format>

In [89]:
y

0     0
1     0
2     0
3     0
4     0
5     1
6     1
7     1
8     1
9     1
10    3
11    3
12    3
13    3
14    3
15    2
16    2
17    2
18    2
19    2
Name: num_category, dtype: int32

## Model Building

### LogisticRegression

In [135]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [136]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=149)

In [137]:
model=LogisticRegression()


In [138]:
model.fit(x_train,y_train)

In [139]:
y_pred=model.predict(x_test)

In [140]:
accuracy=accuracy_score(y_test,y_pred)
print("Accuracy:",accuracy)

# Generate classification report
report = classification_report(y_test, y_pred)

# Print the report
print("Classification Report:\n", report)

Accuracy: 0.78
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.90      0.84        21
           1       0.65      0.54      0.59        24
           2       0.92      0.92      0.92        26
           3       0.73      0.76      0.75        29

    accuracy                           0.78       100
   macro avg       0.77      0.78      0.78       100
weighted avg       0.77      0.78      0.78       100



### DecisionTree

In [141]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
# Generate synthetic data
x, y = make_classification(n_samples=500, n_features=21, n_classes=4, n_clusters_per_class=1, random_state=149)
# Split the dataset into training and testing sets
x_test,x_train,y_test,y_train=train_test_split(x,y,test_size=0.2,random_state=149)
decision_tree_model=DecisionTreeClassifier(random_state=149)


In [142]:
# Train the Decision Tree model
decision_tree_model.fit(x_train,y_train)

In [143]:
# Make predictions on the testing data
y_pred=decision_tree_model.predict(x_test)

In [144]:
# Evaluate the accuracy of the model
accuracy=accuracy_score(y_test,y_pred)
print("Accuracy:",accuracy)

Accuracy: 0.82


In [145]:
# classification report
report = classification_report(y_test, y_pred)

print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.66      0.73       102
           1       0.70      0.73      0.71        99
           2       0.90      0.94      0.92       102
           3       0.86      0.96      0.91        97

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400



### SVM

In [146]:
from sklearn.svm import SVC

In [147]:
# Generate synthetic data
x, y = make_classification(n_samples=500, n_features=21, n_classes=4, n_clusters_per_class=1, random_state=149)

In [148]:
# Split the data into training and testing sets
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=149)

In [162]:
#Initialize the SVM model
svm_model=SVC(kernel='rbf', C=1.0, gamma='scale', random_state=149)

In [152]:
# Train the SVM model
svm_model.fit(x_train, y_train)

In [154]:
# Make predictions on the testing data
y_pred = svm_model.predict(x_test)

In [155]:
# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.88


In [156]:
# classification report
report = classification_report(y_test, y_pred)


print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.90      0.95        21
           1       0.73      0.79      0.76        24
           2       1.00      0.96      0.98        26
           3       0.83      0.86      0.85        29

    accuracy                           0.88       100
   macro avg       0.89      0.88      0.88       100
weighted avg       0.89      0.88      0.88       100



### Adaboost

In [157]:
from sklearn.ensemble import AdaBoostClassifier

In [158]:
# Generate synthetic data
x, y = make_classification(n_samples=500, n_features=21, n_classes=4, n_clusters_per_class=1, random_state=149)

In [160]:
# Split the data into training and testing sets
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=149)

In [163]:
# Initialize the AdaBoost model
ada_boost_model=AdaBoostClassifier(n_estimators=1000,learning_rate=0.1,random_state=149)

In [164]:
#Train the AdaBoost model
ada_boost_model.fit(x_train,y_train)

In [166]:
# make predictions on the testing data
y_pred=ada_boost_model.predict(x_test)

In [167]:
# Evalute the accuracy of the model
accuracy=accuracy_score(y_test,y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.78


In [168]:
# Classification Report 
report=classification_report(y_test,y_pred)
print("Classification Report:\n",report )

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.90      0.83        21
           1       0.73      0.79      0.76        24
           2       0.86      0.69      0.77        26
           3       0.79      0.76      0.77        29

    accuracy                           0.78       100
   macro avg       0.78      0.79      0.78       100
weighted avg       0.79      0.78      0.78       100



### KNN

In [169]:
from sklearn.neighbors import KNeighborsClassifier

In [170]:
# Generate synthetic data for demonstration
x, y = make_classification(n_samples=500, n_features=5, n_classes=4, n_clusters_per_class=1, random_state=149)

In [171]:
# split the data into training and testing sets
x_test,x_train,y_test,y_train=train_test_split(x,y,test_size=0.2,random_state=149)

In [172]:
# Initialize the KNN model
k=8
knn_model=KNeighborsClassifier(n_neighbors=k)


In [174]:
# train the model on training dataq
knn_model.fit(x_train,y_train)

In [176]:
# Make prediction on the testing data 
y_pred=knn_model.predict(x_test)

In [177]:
# Evaluate the accuracy of the model
accuracy=accuracy_score(y_test,y_pred)
print("Accuracy: \n",accuracy)

Accuracy: 
 0.7425


In [178]:
# classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.89      0.89        99
           1       0.67      0.75      0.71        99
           2       0.72      0.90      0.80        97
           3       0.69      0.46      0.55       105

    accuracy                           0.74       400
   macro avg       0.74      0.75      0.74       400
weighted avg       0.74      0.74      0.73       400



###  GradientBoosting

In [179]:
from sklearn.ensemble import GradientBoostingClassifier

In [228]:
x,y= make_classification(n_samples=500, n_features=20, n_classes=4, n_clusters_per_class=1, random_state=149)

In [229]:
# split the data into testing and training sets
x_test,x_train,y_test,y_train=train_test_split(x,y,test_size=0.2,random_state=149)


In [230]:
# Initiaize the Gradientboosting model
gbm_model=GradientBoostingClassifier(n_estimators=400, learning_rate=0.1, max_depth=4, random_state=149)

In [231]:
# training the model
gbm_model.fit(x_train,y_train)

In [232]:
# make predictions on the testing data 
y_pred=gbm.predict(x_test)

In [234]:
# Evaluate the accuracy  of the model 
accuracy= accuracy_score(y_test,y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.27


In [235]:
# classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.25      0.38      0.30        98
           1       0.28      0.68      0.40       104
           2       0.00      0.00      0.00       102
           3       0.00      0.00      0.00        96

    accuracy                           0.27       400
   macro avg       0.13      0.27      0.17       400
weighted avg       0.13      0.27      0.18       400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


##  lightgbm

In [237]:
 import lightgbm as lgb

In [247]:
x, y = make_classification(n_samples=500, n_features=30, n_classes=4, n_clusters_per_class=1, random_state=149)

In [248]:
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=149)

In [249]:
# Define parameters for LightGBM
params = {
    'objective': 'multiclass',
    'num_class': 4,
    'metric': 'multi_error',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'random_state': 149
}

In [250]:
# Initialize the LightGBM Classifier
lgbm = lgb.LGBMClassifier(**params, n_estimators=1000)

In [251]:
# Train the LightGBM model
lgbm.fit(x_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000425 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4012
[LightGBM] [Info] Number of data points in the train set: 400, number of used features: 30
[LightGBM] [Info] Start training from score -1.366492
[LightGBM] [Info] Start training from score -1.366492
[LightGBM] [Info] Start training from score -1.469676
[LightGBM] [Info] Start training from score -1.347074






In [252]:
# Make predictions on the testing data
y_pred = lgbm.predict(x_test)

In [253]:
# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9


In [254]:
# classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.88      0.91        24
           1       0.74      0.87      0.80        23
           2       1.00      1.00      1.00        31
           3       0.90      0.82      0.86        22

    accuracy                           0.90       100
   macro avg       0.90      0.89      0.89       100
weighted avg       0.91      0.90      0.90       100



In [None]:
#END