In [1]:
import pandas as pd
import numpy as np
import scipy

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge

In [2]:
df_train = pd.read_csv("./input_data/salary-train.csv")
df_train.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355


## Text and categorical feature preparation

In [6]:
def text_preparation(df, text_feature="FullDescription"):
    """Return a one-column frame holding a normalized copy of a text column.

    The text is lower-cased and every character outside ``a-z`` / ``0-9`` is
    replaced by a single space. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column named by ``text_feature``.
    text_feature : str, default "FullDescription"
        Name of the text column to normalize.

    Returns
    -------
    pandas.DataFrame
        Frame with the single column ``text_feature``, indexed like ``df``.
    """
    # Lower-case first so the character class only needs the lower-case range.
    normalized = (
        df[text_feature]
        .str.lower()
        .replace('[^a-z0-9]', ' ', regex=True)
    )
    return pd.DataFrame({text_feature: normalized})

def categ_transform(df, cols_cat):
    """Copy categorical columns, replacing missing values with the string "nan".

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    cols_cat : iterable of str
        Names of the columns to copy.

    Returns
    -------
    pandas.DataFrame
        One column per name in ``cols_cat`` (in that order), with missing
        entries replaced by ``"nan"``.
    """
    X = pd.DataFrame()
    for name in cols_cat:
        # Fill on the source Series and assign the result. A chained
        # ``X[name].fillna(..., inplace=True)`` operates on a temporary object
        # under pandas Copy-on-Write and would leave the NaNs in ``X``.
        X[name] = df[name].fillna("nan")
    return X

In [5]:
# Fit TF-IDF on the cleaned training descriptions. With min_df=5, terms that
# appear in fewer than five documents are left out of the vocabulary.
vectorizer = TfidfVectorizer(min_df=5)
X_train_text = vectorizer.fit_transform(
    text_preparation(df_train)["FullDescription"]
)

In [7]:
# Categorical columns to encode; categ_transform fills their missing values.
cols_cat = ["LocationNormalized", "ContractTime"]

# Fit the encoder on one {column: value} dict per training row.
enc = DictVectorizer()
X_train_cat = enc.fit_transform(
    categ_transform(df_train, cols_cat)[cols_cat].to_dict('records')
)

### Join encoded categorical features and vectorized text

In [8]:
# Column-wise concatenation: TF-IDF text columns first, then the
# DictVectorizer-encoded categorical columns.
X_train = scipy.sparse.hstack([X_train_text, X_train_cat])

In [9]:
# Regression target: the normalized salary column of the training set.
y = df_train["SalaryNormalized"]

# Ridge regression with regularization strength 1; random_state is fixed
# for reproducibility.
model = Ridge(random_state=241, alpha=1)

In [10]:
# Fit the ridge model on the stacked sparse feature matrix against the salary target.
model.fit(X_train, y)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=241, solver='auto', tol=0.001)

In [11]:
# Load the small test set; its SalaryNormalized column is empty in the file.
df_test = pd.read_csv("./input_data/salary-test-mini.csv")
df_test

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,We currently have a vacancy for an HR Project ...,Milton Keynes,contract,
1,A Web developer opportunity has arisen with an...,Manchester,permanent,


## Test data transformation

In [12]:
# Apply the same cleaning, then reuse the vectorizer fitted on the training
# descriptions (transform only, no refit).
X_test_text = vectorizer.transform(
    text_preparation(df_test)["FullDescription"]
)

In [13]:
# Encode the test categoricals with the encoder fitted on the training rows
# (transform only, no refit).
X_test_cat = enc.transform(
    categ_transform(df_test, cols_cat)[cols_cat].to_dict('records')
)

In [14]:
# Stack text and categorical blocks in the same order as for X_train so the
# columns line up with the fitted model.
X_test = scipy.sparse.hstack([X_test_text, X_test_cat])

In [16]:
# Predict salaries for the test feature matrix.
out = model.predict(X_test)

In [17]:
# Write the predictions, rounded to two decimals, as one space-separated line.
# The context manager closes the file even if formatting or writing raises.
with open("./week4/salary.dat", "w") as w:
    w.write(" ".join([f"{np.round(x,2)}" for x in out]))