# ML Homework 1

## Imports

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier


## Generate data

### Create DataFrame from randomized data

In [8]:
scores = ['A', 'B', 'C']

In [9]:
def get_random_data(score):
    min_ = 1
    if score == 'A':
        min_ = 50
    elif score == 'B':
        min_ = 30

    return {
        "math": np.random.randint(min_, 100),
        "physics": np.random.randint(min_, 100),
        "sport": np.random.randint(min_, 100),
        "programming": np.random.randint(min_, 100),
        "english": np.random.randint(min_, 100),
        "history": np.random.randint(min_, 100),
        "score": score
    }


data = [get_random_data(np.random.choice(scores)) for _ in range(2000)]
df = pd.DataFrame(data)

In [10]:
df.head()

Unnamed: 0,math,physics,sport,programming,english,history,score
0,89,47,84,47,59,37,B
1,77,56,77,77,46,61,C
2,84,68,98,90,87,92,A
3,79,83,58,93,89,86,A
4,66,73,33,48,58,86,B


### Encoding target feature

In [11]:
label_enc = LabelEncoder()
df['score'] = label_enc.fit_transform(df['score'])

In [12]:
df.head()

Unnamed: 0,math,physics,sport,programming,english,history,score
0,89,47,84,47,59,37,1
1,77,56,77,77,46,61,2
2,84,68,98,90,87,92,0
3,79,83,58,93,89,86,0
4,66,73,33,48,58,86,1


## Splitting data

In [13]:
y = df['score']
X = df.drop('score', axis=1)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Choose top-3 best feature for training

In [15]:
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,math,physics,sport,programming,english,history,score
math,1.0,0.205587,0.195104,0.16488,0.209607,0.170442,-0.446511
physics,0.205587,1.0,0.196808,0.199636,0.200427,0.196611,-0.422427
sport,0.195104,0.196808,1.0,0.204539,0.149181,0.221742,-0.436074
programming,0.16488,0.199636,0.204539,1.0,0.144366,0.15672,-0.423247
english,0.209607,0.200427,0.149181,0.144366,1.0,0.177365,-0.403522
history,0.170442,0.196611,0.221742,0.15672,0.177365,1.0,-0.403723
score,-0.446511,-0.422427,-0.436074,-0.423247,-0.403522,-0.403723,1.0


<p>Топ-3 очевидно матан, прогерство и спорт!</p>

In [30]:
def get_predicted_score(scores, model, le_enc):
    x = pd.DataFrame([scores])
    return le_enc.inverse_transform(model.predict(x))

## CatBoost Model

### Training

In [16]:
catboost_model = CatBoostClassifier(iterations=50, learning_rate=0.1)
catboost_model.fit(
    X_train, y_train,
    verbose=False
)
y_pred = catboost_model.predict(X_test)

### Get accuracy score of model

In [17]:
print(accuracy_score(y_test, y_pred))

0.895


### Predict some score

In [32]:
score = {
    "math": 70,
    "physics": 70,
    "sport": 70,
    "programming": 70,
    "english": 70,
    "history": 70,
}
get_predicted_score(score, catboost_model, label_enc)[0]

  y = column_or_1d(y, warn=True)


'A'

## Logistic Regression Model

### Training

In [33]:
logreg_model = LogisticRegression(random_state=42)
logreg_model.fit(X_train, y_train)
y_pred = logreg_model.predict(X_test)

### Testing

In [34]:
accuracy_score(y_pred, y_test)

0.6875

### Predict

In [37]:
score = {
    "math": 40,
    "physics": 40,
    "sport": 40,
    "programming": 70,
    "english": 70,
    "history": 70,
}
get_predicted_score(score, logreg_model, label_enc)[0]

'C'