In [4]:
import pandas as pd
import numpy as np
import pymysql

# Predictive Modeling

Currently, I only have a limited dataset, with just a few rows of data, which isn't sufficient to train more complex models like bagging or boosted trees, or to perform a proper train-test split. As a result, I'll be using simpler methods, such as logistic regression and k-Nearest Neighbors (kNN), until I collect enough data for more advanced modeling.

#### Loading Data

In [10]:
class DataQuerier:
    def __init__(self) -> None:
        self.connection = None
        self._connect()
        pass
    
    def _connect(self):
        endpoint = "leetcode.c9eq4wc6mqs0.us-east-2.rds.amazonaws.com"
        port = 3306
        username = "streamlit"
        password = "password123"
        database = "leetcode"

        try:
            self.connection = pymysql.connect(
                host=endpoint,
                user=username,
                password=password,
                database=database,
                port=port
            )
            print("Connection successful!")
            
        except pymysql.MySQLError as e:
            print(f"Error: {e}")
    
    def query(self, query):
        try:
            with self.connection.cursor() as cursor:
                cursor.execute(query)
                results = cursor.fetchall()
                print("Successfully Executed")
                
                column_names = [desc[0] for desc in cursor.description]
                
                if column_names:
                    return pd.DataFrame(results, columns = column_names)
                else:
                    return pd.DataFrame(results)
        except pymysql.MySQLError as e:
            print(f"Error: {e}")
    
    def close(self):
        print("Connection Closed")
        self.connection.close()

In [11]:
querier = DataQuerier()
df = querier.query("SELECT * FROM daily_problems")
querier.close()

Connection successful!
Successfully Executed
Querier Closed


### Preprocessing

In [16]:
print(f"Rows: {df.shape[0]}")

df.head(5)

Rows: 16


Unnamed: 0,id,date,name,complexity,acceptance_rate,time,language,chat_gpt,speed,memory,skills,notes
0,539,2024-09-16 07:43:57,Minimum Time Difference,Medium,59.9,2695.09,Python,0.0,7.38,70.9,"Array, Math, String, Sorting",Got stuck for a while doing a O(n^2) solution ...
1,725,2024-09-08 07:38:16,Split Linked List in Parts,Medium,67.5,1186.32,Python,0.0,,,Linked List,Very slow solution
2,874,2024-09-04 11:10:35,Walking Robot Simulation,Medium,52.2,4326.63,Python,0.0,,,"Array, Hash Table, Simulation",I completed 3 solution but none of them was fa...
3,947,2024-08-29 20:19:00,Most Stones Removed with Same Row or Column,Medium,62.0,3104.14,Python,,,,"Hash Table, Depth-First Search, Union Find, Graph",I'm so tired. I tried a solution with an adjac...
4,1310,2024-09-13 12:02:18,XOR Queries of a Subarray,Medium,77.6,899.995,Python,0.0,5.26,76.32,"Array, Bit Manipulation, Prefix Sum","First solution used to for loops, to slow. Opt..."


In [17]:
y = df["time"]

In [22]:
columns = ["complexity", "acceptance_rate", "skills"]
x = df[columns]

## Making a simple preprocessor compatible with SkLearn's Pipeline

In [40]:
from sklearn.base import BaseEstimator, TransformerMixin

class PreprocessData(BaseEstimator, TransformerMixin):
    def __init__(self) -> None:
        self.skills = None
        self.fitted = False
        
    def fit(self, df, y=None):
        self.skills = df['skills'].str.split(',').explode().str.strip().unique()
        self.fitted = True
        return self
    
    def transform(self, df):
        columns = ["complexity", "acceptance_rate", "skills"]
        
        if not self.fitted:
            raise AssertionError("Model has not been fit")
        df = df.copy()
        df = df[columns]
        df = self._ohe_complexity(df)
        df = self._ohe_skills(df)
        df = df.drop(["complexity", "skills"], axis = 1)
        return df 
    
    def _ohe_complexity(self, df) -> pd.DataFrame:
        complexity = ["Easy", "Medium", "Hard"]
        
        for level in complexity:
            df[f'complexity_{level}'] = df['complexity'].apply(lambda x: 1 if x == level else 0)
        
        return df
    
    def _ohe_skills(self, df) -> pd.DataFrame:
        for skill in self.skills:
            df[f'skill_{skill}'] = df['skills'].apply(lambda x: 1 if skill in x.split(',') else 0)
        
        return df

In [41]:
preproccessor = PreprocessData()
cleaned_x = preproccessor.fit_transform(x)

In [48]:
print(f"Features: {cleaned_x.shape[1]}")

Features: 26


## Fitting Logistic Regression

In [61]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

pipeline = Pipeline(steps=[
    ('preprocessor', PreprocessData()),
    ('regressor', LinearRegression())
])

In [59]:
# Fit the pipeline with data
pipeline.fit(df, y)

# Predict with the fitted model
y_pred = pipeline.predict(df)

In [65]:
print(f"Mean Squared Error: {mean_squared_error(y, y_pred)}")

Mean Squared Error: 865870.7116983548
