# Intro to Data Science 
## Part XII. - Solving Kaggle's Job Salary prediction competition

In [None]:
%matplotlib inline
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.base import clone
from sklearn.exceptions import NotFittedError
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error

## I. Read a small sample
Note: download and extract the **`Train_rev1.csv`** data file from [Kaggle](https://www.kaggle.com/c/job-salary-prediction/data), rename it to `job_salary_prediction.csv` and place it in the `data/` directory.

In [None]:
# Pick the line numbers to skip so that only about one tenth of the file is loaded.
# Seeded so the same sample is drawn on every run.
random.seed(137)
# Count lines by streaming the file instead of holding every line in memory
# with readlines(); the resulting count is the same.
with open("data/job_salary_prediction.csv", "br") as infile:
    numlines = sum(1 for _ in infile)
# Index 0 is never sampled, so the first line of the file is always kept.
# NOTE(review): assumes one CSV record per physical line — TODO confirm that no
# quoted field contains an embedded newline.
skip_index = random.sample(range(1, numlines), numlines - numlines // 10)

In [None]:
# Load only the lines that were not selected for skipping; `Id` becomes the index.
data = pd.read_csv(
    "data/job_salary_prediction.csv",
    skiprows=skip_index,
    index_col='Id',
)
data.head()

### Check null values

In [None]:
# Fraction of missing values per column (mean of the boolean null mask).
data.isna().mean()

In [None]:
# Show the distinct values of both contract-related columns.
for position, column in enumerate(('ContractType', 'ContractTime')):
    if position:
        print()  # blank line between the two columns
    print(column)
    print(data[column].unique())

In [None]:
# Summary statistics (count / unique / top / freq) for both contract columns.
contract_columns = ['ContractType', 'ContractTime']
for column in contract_columns:
    if column != contract_columns[0]:
        print()  # blank line between the two summaries
    print(column)
    print(data[column].describe())

The missing values in `ContractType` and `ContractTime` could easily be imputed, but since the majority of the values are the same, we skip it.

### Train-test split

In [None]:
# Split the full frame exactly once. The text-only inputs are then taken from
# the same partition, so X_*, df_* and y_* are guaranteed to describe the same
# rows instead of relying on two separate shuffles agreeing with each other.
df_train, df_test, y_train, y_test = train_test_split(
    data,
    data.SalaryNormalized,
    test_size=0.25,
    random_state=137,
)

# Description-only views used by the single-feature models.
X_train = df_train.FullDescription
X_test = df_test.FullDescription

---

## II. Baseline model

To put our models into context, we create a baseline model that always predicts the mean of the training targets, the regression equivalent of a "coin toss" decision.

In [None]:
class Baseline(BaseEstimator, RegressorMixin):
    """Trivial regressor that predicts the mean of the training targets.

    The input features are ignored entirely; only the number of samples in
    ``X`` is used at prediction time.
    """

    def fit(self, X, y):
        """Calculate mean value for y.

        Parameters
        ----------
        X : array-like
            not used
        y : array-like
            target values

        Returns
        -------
        self : object
            Returns self.
        """
        # np.mean accepts plain sequences as well as numpy / pandas objects,
        # whereas calling ``y.mean()`` requires y to provide that method.
        self.value = np.mean(y)
        return self

    def predict(self, X):
        """Returns calculated mean for every item in X.

        Parameters
        ----------
        X : array-like
            Input matrix; only its number of samples is used.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            "Predicted" target values for X.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called yet. ``NotFittedError`` is a
            subclass of ``ValueError``, so existing handlers keep working.
        """
        if not hasattr(self, 'value'):
            raise NotFittedError("Regressor not trained yet. Use fit method first.")
        # Prefer ``shape`` when available and fall back to ``len`` so plain
        # lists are accepted too.
        n_samples = X.shape[0] if hasattr(X, 'shape') else len(X)
        return np.full(n_samples, self.value, dtype=float)

In [None]:
# Wrap the mean-predicting baseline in a Pipeline so it is used the same way
# as the real models further down.
baseline_steps = [("base", Baseline())]
basepipe = Pipeline(steps=baseline_steps)

basepipe.fit(X_train, y_train)

In [None]:
# Mean absolute error of the baseline on the held-out rows.
baseline_predictions = basepipe.predict(X_test)
mean_absolute_error(y_test, baseline_predictions)

This is our baseline prediction error. 

---

## III. First model: 
### Regression model, using only FullDescription

In [None]:
# Keep only terms that appear in at least 10% and at most 90% of the documents.
tfidf = TfidfVectorizer(min_df=0.1, max_df=0.9, stop_words='english')
# Fit on the training texts only: the pipelines below are fitted on X_train,
# so this is the vocabulary they work with, and the held-out rows stay unseen.
tfidf.fit(X_train)
print("Vocabulary size:", len(tfidf.vocabulary_))

In [None]:
# Give the pipeline its own unfitted copy of the vectorizer. A Pipeline keeps
# the step object it is handed, so passing `tfidf` directly would share one
# mutable instance with every other pipeline built from it (a later
# `set_params` or refit on one pipeline would silently affect the others).
linpipe = Pipeline(steps=[
    ("tfidf", clone(tfidf)),
    ("linreg", LinearRegression())
])
linpipe.fit(X_train, y_train)

# Mean absolute error of plain linear regression on the held-out rows.
print(mean_absolute_error(y_test, linpipe.predict(X_test)))

In [None]:
# Use an independent, unfitted copy of the vectorizer so that tuning or
# refitting this pipeline cannot change any other pipeline built from `tfidf`.
ridegpipe = Pipeline(steps=[
    ("tfidf", clone(tfidf)),
    ("rideg", Ridge(random_state=137))
])
ridegpipe.fit(X_train, y_train)

# Mean absolute error of ridge regression on the held-out rows.
print(mean_absolute_error(y_test, ridegpipe.predict(X_test)))

### Exercise I: Hyperparameter optimization

Optimize the `tfidf__min_df`, `tfidf__max_df` and `rideg__alpha` parameters of the pipeline!

Don't forget to use the *data.FullDescription* and *data.SalaryNormalized* instead of the train-test splits.

## IV. Using multiple features

Q: What if we need to use different transformations for each feature?  
A: ColumnTransformer

https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# One transformer per text column: raw token counts for the job title and
# TF-IDF weights for the full description.
title_counts = ('count', CountVectorizer(), 'Title')
description_tfidf = (
    'tfidf',
    TfidfVectorizer(min_df=0.1, max_df=0.9, stop_words='english'),
    'FullDescription',
)
feat = ColumnTransformer([title_counts, description_tfidf])

In [None]:
# Clone the column transformer so this pipeline owns its own unfitted copy;
# reusing the `feat` object itself would tie this pipeline's state and
# parameters to every other pipeline built from it.
ulti_lin_pipe = Pipeline([
    ('union', clone(feat)),
    ('linreg', LinearRegression())
])
ulti_lin_pipe.fit(df_train, y_train)

# Mean absolute error of linear regression on title + description features.
print(mean_absolute_error(y_test, ulti_lin_pipe.predict(df_test)))

In [None]:
# Clone the column transformer so that tuning this pipeline (for example via
# `set_params`) cannot alter any other pipeline built from `feat`.
ulti_rideg_pipe = Pipeline([
    ('union', clone(feat)),
    ('rideg', Ridge(random_state=137))
])
ulti_rideg_pipe.fit(df_train, y_train)

# Mean absolute error of ridge regression on title + description features.
print(mean_absolute_error(y_test, ulti_rideg_pipe.predict(df_test)))

In [None]:
# NOTE(review): this cell repeats the preceding Ridge cell; it rebuilds and
# refits the same model and prints the same score. Consider removing it.
# The transformer is cloned so the pipeline does not share state with others
# built from `feat`.
ulti_rideg_pipe = Pipeline([
    ('union', clone(feat)),
    ('rideg', Ridge(random_state=137))
])
ulti_rideg_pipe.fit(df_train, y_train)

print(mean_absolute_error(y_test, ulti_rideg_pipe.predict(df_test)))

### Exercise II: Add more columns

Following the same process, add every reasonable column with preprocessing to the pipelines and inspect `LinearRegression` and `Ridge` errors.

### Exercise III: Hyperparameter optimization

Find the previously assembled pipeline's optimal parameters. To easily find the correct parameter names use the pipeline's `get_params` method.

In [None]:
# Collect every parameter name exposed by the Ridge pipeline and print them
# as a bulleted list, one name per line.
ulti_rideg_pipe_param_names = ulti_rideg_pipe.get_params().keys()

bullet_list = '- ' + '\n- '.join(ulti_rideg_pipe_param_names)
print("ulti_rideg_pipe's parameter names:")
print(bullet_list)