In [None]:
import pandas as pd
import random
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt
# Read the two feature tables and the salary targets from the working directory.
train_data = pd.read_csv('train_features.csv')
test_data = pd.read_csv('test_features.csv')
train_salary = pd.read_csv('train_salaries.csv')

# Every column except the three excluded here is handled as a categorical feature.
category_feat = [
    name
    for name in train_data.columns
    if name not in ('jobId', 'yearsExperience', 'milesFromMetropolis')
]
# Categorical columns in which the 'NONE' placeholder occurs at least once.
cols_none = [name for name in category_feat if (train_data[name] == 'NONE').any()]
# Columns Degree and Major were found to have 'NONE' values.

# Distinct labels of the two affected columns, used as sampling pools by the
# imputation helpers.
degrees = pd.unique(train_data['degree'])
majors = pd.unique(train_data['major'])

def degree_update(jobType, degree, major, choices=None):
    """Return an imputed degree for a row whose degree is the 'NONE' placeholder.

    Args:
        jobType: Job type label of the row.
        degree: Current degree label; returned unchanged unless it is 'NONE'.
        major: Major label of the row. Not used by the imputation; kept so the
            signature stays compatible with existing callers.
        choices: Optional iterable of candidate degree labels to sample from.
            When omitted, the module-level ``degrees`` collection is used.

    Returns:
        'HIGH_SCHOOL' for a 'JANITOR' row with a missing degree, a randomly
        chosen non-'NONE' candidate for any other row with a missing degree,
        and ``degree`` itself otherwise. If the pool holds no usable
        candidate, the placeholder is returned unchanged.
    """
    if degree != 'NONE':
        return degree
    if jobType == 'JANITOR':
        # Every row with the jobType 'JANITOR' had degree 'HIGH_SCHOOL',
        # so the same value is used for rows with missing values.
        return 'HIGH_SCHOOL'
    # Every jobType other than 'JANITOR' had a random and equal distribution
    # of degree, so missing values are filled by uniform sampling.
    pool = degrees if choices is None else choices
    # The pool may itself contain the 'NONE' placeholder; drawing it would
    # leave the value missing, so it is removed before sampling.
    candidates = [label for label in pool if label != 'NONE']
    if not candidates:
        # Nothing to sample from: keep the placeholder instead of raising.
        return degree
    return random.choice(candidates)
def major_update(degree, major, choices=None):
    """Return an imputed major for a row whose major is the 'NONE' placeholder.

    Args:
        degree: Degree label of the row.
        major: Current major label; returned unchanged unless it is 'NONE'.
        choices: Optional iterable of candidate major labels to sample from.
            When omitted, the module-level ``majors`` collection is used.

    Returns:
        'NO_MAJOR' for a 'HIGH_SCHOOL' row with a missing major, a randomly
        chosen non-'NONE' candidate for any other row with a missing major,
        and ``major`` itself otherwise. If the pool holds no usable
        candidate, the placeholder is returned unchanged.
    """
    if major != 'NONE':
        return major
    if degree == 'HIGH_SCHOOL':
        # Because 'HIGH_SCHOOL' rows can't have a major, the empty values are
        # given a dedicated label.
        return 'NO_MAJOR'
    # Every row had a random and equal distribution of major, so missing
    # values are filled by uniform sampling.
    pool = majors if choices is None else choices
    # The pool may itself contain the 'NONE' placeholder; drawing it would
    # leave the value missing, so it is removed before sampling.
    candidates = [label for label in pool if label != 'NONE']
    if not candidates:
        # Nothing to sample from: keep the placeholder instead of raising.
        return major
    return random.choice(candidates)
def feature_modification(dataframe):
    """Impute the 'degree' and 'major' columns of ``dataframe`` in place.

    Each row is passed through ``degree_update`` and then ``major_update``.
    The frame that was passed in is modified and also returned.
    """
    def _row_degree(record):
        return degree_update(record['jobType'], record['degree'], record['major'])

    def _row_major(record):
        return major_update(record['degree'], record['major'])

    # Degree goes first: the major pass reads the already-updated degree column.
    dataframe['degree'] = dataframe.apply(_row_degree, axis=1)
    dataframe['major'] = dataframe.apply(_row_major, axis=1)
    return dataframe

# Fill the 'NONE' placeholders in both feature tables.
train_data_updated = feature_modification(train_data)
test_data_updated = feature_modification(test_data)

# Integer-encode every categorical column. The label-to-code mapping is learned
# from the training table and then applied unchanged to the test table.
enc = preprocessing.LabelEncoder()
for column in category_feat:
    train_data_updated[column] = enc.fit_transform(train_data_updated[column])
    test_data_updated[column] = enc.transform(test_data_updated[column])

from sklearn.model_selection import train_test_split

# Attach the salary target to the encoded features. An inner join keeps only
# jobIds present in both tables; an outer join would add rows with NaN features
# or a NaN salary for unmatched ids, which the regressor cannot be trained on.
train_dataframe = pd.merge(train_data_updated, train_salary, on='jobId', how='inner')

# Index.difference returns the remaining labels sorted, which fixes the
# feature column order used for training.
X = train_dataframe[train_dataframe.columns.difference(['jobId', 'salary'])]
y = train_dataframe['salary']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Model selection: exhaustive hyper-parameter search with GridSearchCV
param_grid = dict(
    n_estimators=[200, 500],
    max_depth=[3, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 3, 10],
    bootstrap=[True, False],
)
model = RandomForestRegressor(random_state=0)
grid = GridSearchCV(model, param_grid, n_jobs=-1)
grid.fit(X_train, y_train)
# Report the best score found and the parameter combination that produced it.
print(grid.best_score_, grid.best_params_, sep='\n')

# Train the final forest with the hyper-parameters selected by the grid search
# instead of a hand-copied combination, and seed it so the reported metrics
# and the exported predictions are reproducible.
model_reg = RandomForestRegressor(**grid.best_params_, n_jobs=-1, random_state=0)
model_reg.fit(X_train, y_train)
score_test = model_reg.score(X_test, y_test)
y_pred = model_reg.predict(X_test)

print("Root mean squared error: {:.2f}".format(sqrt(mean_squared_error(y_test, y_pred))))
print("r2_score: {:.2f}".format(r2_score(y_test, y_pred)))

# Feature importances listed in descending order
feature_values = pd.DataFrame(data=model_reg.feature_importances_, index=X.columns.values, columns=['values'])
feature_values.sort_values(['values'], ascending=False, inplace=True)
# A bare expression is not displayed when it is not the last statement of a
# notebook cell or when run as a script, so print the table explicitly.
print(feature_values.transpose())

# Set the identifiers aside; the model only sees the feature columns, taken in
# the sorted order produced by Index.difference.
test_jobId = pd.Series(test_data_updated['jobId'])
test_data_updated_feat = test_data_updated.loc[:, test_data_updated.columns.difference(['jobId'])]

# Predict salaries for the test rows and write them next to their jobId.
predictions = pd.Series(model_reg.predict(test_data_updated_feat))
final_prediction = pd.DataFrame(
    {'jobId': test_jobId, 'salary': predictions},
    columns=['jobId', 'salary'],
)
final_prediction.to_csv('final_prediction.csv', index=False)