In [5]:
import numpy as np
import pandas as pd
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge

### In this task I need to predict the salary from the job description text, the location and the contract time

In [6]:
# Read the training set and the small test set; both paths are relative
# to the current working directory.
data_train, data_test = (
    pd.read_csv(csv_name)
    for csv_name in ('salary-train.csv', 'salary-test-mini.csv')
)

In [7]:
# Normalise the description text in both frames: lower-case it, then turn
# every character that is not a letter or a digit into a single space.
data_train['FullDescription'] = (
    data_train['FullDescription']
    .str.lower()
    .replace('[^a-zA-Z0-9]', ' ', regex=True)
)
data_test['FullDescription'] = (
    data_test['FullDescription']
    .str.lower()
    .replace('[^a-zA-Z0-9]', ' ', regex=True)
)

### First, I encode the description with TF-IDF

In [8]:
# TF-IDF features for the description text. The vectorizer is fitted on the
# training descriptions only; the test descriptions are transformed with the
# vocabulary learned there. min_df=5 is passed through unchanged.
tf = TfidfVectorizer(min_df=5)
result = tf.fit_transform(data_train['FullDescription'])
test = tf.transform(data_test['FullDescription'])

In [9]:
# Replace missing values in the two categorical columns with the literal
# string 'nan', so that a missing value is treated as a category of its own.
#
# The filled column is assigned back to the frame instead of calling
# `frame[column].fillna(..., inplace=True)`: that chained in-place form
# modifies a temporary object under pandas Copy-on-Write (it raises a
# FutureWarning in pandas 2.2 and silently does nothing in pandas 3.0),
# which would leave the NaN values in place.
for frame in (data_train, data_test):
    for column in ('LocationNormalized', 'ContractTime'):
        frame[column] = frame[column].fillna('nan')

### Convert the location and contract time of every row into a dictionary and encode them with DictVectorizer

In [10]:
# Encode the two categorical columns. Each row becomes a
# {'LocationNormalized': ..., 'ContractTime': ...} dict; the vectorizer is
# fitted on the training rows and then reused, unchanged, for the test rows.
enc = DictVectorizer()
train_records = data_train[['LocationNormalized', 'ContractTime']].to_dict(orient='records')
test_records = data_test[['LocationNormalized', 'ContractTime']].to_dict(orient='records')
X_train_categ = enc.fit_transform(train_records)
X_test_categ = enc.transform(test_records)

In [11]:
# Final design matrices: the TF-IDF columns followed by the categorical
# columns, stacked side by side. Train and test use the same column order.
train = scipy.sparse.hstack((result, X_train_categ))
pred = scipy.sparse.hstack((test, X_test_categ))


### Make a prediction and write it to the file

In [12]:
# Fit ridge regression on the stacked training features against the
# 'SalaryNormalized' column, then predict for the stacked test features.
target = data_train['SalaryNormalized']
ridger = Ridge(alpha=1, random_state=241)
ridger.fit(X=train, y=target)
res = ridger.predict(X=pred)

In [13]:
# Answer text: the first two predictions, each rounded to two decimal
# places, separated by a single space.
first_salary, second_salary = round(res[0], 2), round(res[1], 2)
string = ' '.join((str(first_salary), str(second_salary)))

In [14]:
# Show the answer in the notebook, then save the same text to the file
# 'answer' in the working directory (mode 'w' overwrites an existing file).
print(string)
with open('answer', mode='w') as answer_file:
    answer_file.write(string)

56582.89 37133.97
