# Salaries

https://machinelearningmastery.com/stacking-ensemble-machine-learning-with-python/


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

ds = pd.read_csv('ds_salaries.csv')
ds = ds.drop(['salary'], axis = 1)
# we don't have any nulls 
print(ds.columns)

# NLP:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')



Index(['Unnamed: 0', 'work_year', 'experience_level', 'employment_type',
       'job_title', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shahafdan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
ds['job_title'] = [tokenizer.tokenize(x.lower()) for x in ds['job_title']]
ds['job_title'] = ds['job_title'].apply(lambda x: [w for w in x if not w in stop_words])
ds['job_title'] = [' '.join(x) for x in ds['job_title']]

ds['salary_currency'] = pd.factorize(ds['salary_currency'])[0]
ds['experience_level'] = pd.factorize(ds['experience_level'])[0]
ds['employee_residence'] = pd.factorize(ds['employee_residence'])[0]
ds['company_location'] = pd.factorize(ds['company_location'])[0]
ds['company_size'] = pd.factorize(ds['company_size'])[0]
ds['employment_type'] = pd.factorize(ds['employment_type'])[0]
ds['remote_ratio'] = pd.factorize(ds['remote_ratio'])[0]

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler


# Set X and y
X = ds[['salary_currency', 'experience_level', 'employee_residence', 'company_size', 'company_location', 'employment_type', 'remote_ratio']]
y = ds['salary_in_usd']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
model = RandomForestRegressor()
model.fit(X_train, y_train)
rf_predictions = model.predict(X_test)
rf_score = model.score(X_test, y_test)

model = LinearRegression()
model.fit(X_train, y_train)
linreg_predictions = model.predict(X_test)
linreg_score = model.score(X_test, y_test)

model = KNeighborsRegressor()
model.fit(X_train, y_train)
knn_pred = model.predict(X_test)
knn_score = model.score(X_test, y_test)

pred_test = pd.DataFrame({
    'test': np.array(y_test), 
    'rf_predictions': list(np.array(rf_predictions)), 
    'knn_predictions': list(np.array(knn_pred)), 
    'svr_predictions': list(np.array(svr_pred)), 
    'linreg_predictions': list(np.array(linreg_predictions))}, 
    columns=['test', 'rf_predictions', 'knn_predictions', 'svr_predictions', 'linreg_predictions'])
pred_test = pred_test.sort_values(by = 'test')
ind = range(len(pred_test))

plt.figure(figsize = (25, 6))
plt.title(model.score(X_test, y_test))
plt.plot(ind, pred_test['test'], linewidth = 2, color = 'orange')
plt.plot(ind, pred_test['rf_predictions'], linewidth = 2, color = 'blue', label = "Random Forest: " + str(round(100 * rf_score, 2)) + "% accuracy")
plt.plot(ind, pred_test['linreg_predictions'], linewidth = 2, color = 'green', label = "Linear Regression: " + str(round(100 * linreg_score, 2)) + "% accuracy")
plt.plot(ind, pred_test['knn_predictions'], linewidth = 2, color = 'red', label = "KNN Regressor: " + str(round(100 * knn_score, 2)) + "% accuracy")
plt.plot(ind, pred_test['svr_predictions'], linewidth = 2, color = 'pink', label = "SVR Regressor: " + str(round(100 * svr_score, 2)) + "% accuracy")
plt.legend()
plt.show()




X = ds[['job_title', 'salary_currency', 'experience_level', 'employee_residence', 'company_size', 'company_location', 'employment_type', 'remote_ratio']]
y = ds['salary_in_usd']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

vectorizer = TfidfVectorizer()
column_transformer = ColumnTransformer([('tfidf', vectorizer, 'job_title')], remainder = 'passthrough')
tfidf = vectorizer.fit_transform(X_train)
model = LinearRegression()
pipe = Pipeline([('tfidf', column_transformer), ('classify', model)])
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)
# print(X_train)
# predictions = pipe.predict(X_test, y_test)

# pred_test = pd.DataFrame({'test': np.array(y_test), 'predictions': list(np.array(predictions))}, columns=['test', 'predictions'])
# pred_test = pred_test.sort_values(by = 'test')
# ind = range(len(pred_test))

# plt.figure(figsize = (25, 15))
# plt.plot(ind, pred_test['test'])
# plt.plot(ind, pred_test['predictions'])
# plt.show()

ValueError: Expected 2D array, got 1D array instead:
array=[125000.  88654. 144854. 152500.  38400.  65438.  99360. 150000. 180000.
 153667. 147000. 140000. 135000. 160080. 170000.  53192.  19609. 211500.
 105000.   8000.  13400.  90320. 170000.  86703.  40189. 130000.  45807.
 450000.  79833.  67000. 130000.  61300. 132320. 102100.  72212.  47899.
 154600.  50000.  80000. 123000.  66022.   5707.  32974. 140000. 100000.
  69999. 100000. 130000. 160000. 380000.  45807. 184700. 162674. 250000.
  60000. 150000.  76958. 276000. 160000. 150000.  54000. 200000. 230000.
  98158.  99000.  81666.  88654. 209100.  65949. 173762. 200000.  91237.
  18907. 180000.  99000.  65438.  90000. 100000. 160000.  10354. 104702.
 220000.   4000. 135000.  62651.  96282.  25532. 115000.  80000. 123000.
 205300.  48000.  45391.  50000. 240000.  93700.  35735. 135000.  56000.
  60000. 210000.  38776. 146000. 209100.  87932. 130000. 220110.  70139.
  98158. 325000.  21983.  46597. 161342. 136620.  50180. 159000. 177000.
 118187.  20171.  93700. 115934.  64849. 144000. 120000.  39263. 108800.
 196979.  60757.  70000.  12000.  98000. 235000.  90320.  49268. 118000.
 150260.  33808. 114047. 117789.  99703. 167000. 126500. 152000. 220110.
 192600.  76833.  29751.  22611. 165400.  90320.  78526.  72000.  15966.
 155000.  90320. 155000.  41689. 105000.  45807. 100000.  20000. 115500.
 215300.  70500. 164996.  52351. 416000.  42000. 200000.  66265.   6072.
  71786. 250000. 423000.  46809. 106260. 174000.  75000.  78526.  96113.
  70912. 170000. 110925.  68147. 190200. 160080.  55000. 157000.  28369.
 242000. 174000.  75774. 189650. 128875.  75000. 167000.  62000.  87932.
 235000. 140400. 105000. 200000. 120000.  58894. 100000. 165000. 135000.
 150000. 175000. 180000. 135000. 128875. 103000.  80000.  28399. 123000.
 105400. 138350. 113476.  58000.  82900.  16904.  52396.  84900. 168000.
  93150. 135000. 112900.  10000.  87932. 106260.  54094. 405000. 165000.
  62649. 220000. 141300.  90000. 100800. 113000. 124190. 216000. 110037.
 210000.  12000.  82500.  21637. 101570. 450000. 189650. 120000.  59102.
  90000.  80000.  50000.  61896.  37825. 180000.  36643.  80000.  65949.
 120000. 260000. 213120. 145000. 151000.  87932. 160000.  56256. 224000.
  54742. 140000.  40570. 160000. 158200. 145000. 140400. 115000.  60000.
  52351.  37300. 140000. 170000. 200100. 100000. 241000. 103160.   5679.
 160000. 104890. 105000.  54957.  52351. 150000.  78000. 200000. 116000.
  76940. 130026. 120160.  63711.  71982.  24342. 324000. 185000. 200000.
 225000.  60000.  65000. 132000.  79039. 225000. 210000.  18442.  95550.
   9466. 170000.  94665. 140400. 106000. 115000.  65000.  78791.  24000.
  43966. 210000. 112872. 150000.  62726. 105000.  28476. 110000.  50000.
 130000. 125000.  39916.  74130. 100000. 170000. 150000.  63810. 135000.
  87425. 116914. 205300.  89294.  91000. 103691.  20000. 137141.  20000.
  81000.  58255. 214000.  79833.  25000. 100000. 200000. 112900.  69741.
 192400. 117789.  93000.  90734. 150000. 200000. 140000. 102100.  43966.
  75000.  87738. 180000. 144000. 100000.   5882.  90700.  82528.  70000.
 130000. 185000. 130800.  58000.  40000. 188000.  68428.  60000. 111775.
 110000.  61467. 100000.  33511.  85000.  16228.  31875. 208775.  51064.
  63900. 120000.  93427. 220000.  21669.  50000.  20000. 192564.  69336.
  45618.  85000. 119059.  45760.  35590. 147800.  69741.  28609.  45896.
 243900. 150000.  90000. 100000.  37236. 129000.  65013. 195000.  85000.
 154600. 185100.  51519. 124333. 260000. 116150.  79197.  12901.  46759.
 120000.  49461. 125000. 136000.  76833. 109280. 156600.  80000.  87000.
 165220. 164000.  85000.  80000.  52000. 210000. 230000. 115000. 165000.
 170000. 175100.  74000.  91614. 150000. 120000. 183228. 205300. 600000.
   6072.  78526.  31615. 230000.  58000.  21844. 132320.  49461.  75000.
 167875. 126500. 100000.  95746. 164996. 110000.  18442. 112900.  24823.
 144000. 200000.  43331.  42197. 187442.  72500.  91614.  36259.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.