# <p style="padding:10px;background-color:#860404;margin:0;color:white;font-family:'Times New Roman',serif;font-size:150%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:500">Import all you need</p>

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px

import plotly.io as pio
pio.renderers.default = "notebook"

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error
import warnings
warnings. filterwarnings('ignore')

In [None]:
# Load the raw survey responses from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/stackoverflow-developer-suvery-2022/survey_results_public.csv"
)

In [None]:
# (number of rows, number of columns) of the freshly loaded frame.
df.shape

In [None]:
# List every column available in the loaded CSV.
df.columns

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Column dtypes and non-null counts of the raw data.
df.info()

# <p style="padding:10px;background-color:#860404;margin:0;color:white;font-family:'Times New Roman',serif;font-size:150%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:500">Visualization</p>

In [None]:
def plot_line_chart(df, column, line=""):
    """Draw a line chart of the 20 most frequent values of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Column whose value frequencies are plotted.
    line : sequence, optional
        Custom x-axis labels. They are paired *by position* with the counts,
        which ``value_counts`` orders from most to least frequent. When left
        as ``""`` (or ``None``) the category values themselves are used.

    Raises
    ------
    ValueError
        If custom labels are supplied and their number differs from the
        number of plotted categories.
    """
    # Count once and reuse the result for both the labels and the heights.
    counts = df[column].value_counts().iloc[:20]

    # Only the empty-string / None sentinel selects the default labels.
    # Testing an array-like with ``== ''`` is element-wise and cannot be
    # used as an ``if`` condition, so check the type explicitly.
    if line is None or (isinstance(line, str) and line == ""):
        labels = counts.index
    else:
        labels = list(line)
        if len(labels) != len(counts):
            raise ValueError(
                f"got {len(labels)} labels for {len(counts)} categories of {column!r}"
            )

    fig = px.line(x=labels, y=counts)
    fig.show()
    
def plot_bar_chart(df, column, line=""):
    """Draw a bar chart of the 20 most frequent values of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Column whose value frequencies are plotted.
    line : sequence, optional
        Custom x-axis labels. They are paired *by position* with the counts,
        which ``value_counts`` orders from most to least frequent. When left
        as ``""`` (or ``None``) the category values themselves are used.

    Raises
    ------
    ValueError
        If custom labels are supplied and their number differs from the
        number of plotted categories.
    """
    # Count once and reuse the result for both the labels and the heights.
    counts = df[column].value_counts().iloc[:20]

    # Only the empty-string / None sentinel selects the default labels.
    # Testing an array-like with ``== ''`` is element-wise and cannot be
    # used as an ``if`` condition, so check the type explicitly.
    if line is None or (isinstance(line, str) and line == ""):
        labels = counts.index
    else:
        labels = list(line)
        if len(labels) != len(counts):
            raise ValueError(
                f"got {len(labels)} labels for {len(counts)} categories of {column!r}"
            )

    fig = px.bar(x=labels, y=counts)
    fig.show()
    
def plot_pie_chart(df, column, line = ''):
    """Draw a pie chart of the 20 most frequent values of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Column whose value frequencies are plotted.
    line : sequence, optional
        Custom slice names. They are paired *by position* with the counts,
        which ``value_counts`` orders from most to least frequent. When left
        as ``''`` (or ``None``) the category values themselves are used.

    Raises
    ------
    ValueError
        If custom names are supplied and their number differs from the
        number of plotted categories.
    """
    # Count once and reuse the result for both the names and the slice sizes.
    counts = df[column].value_counts().iloc[:20]

    # Only the empty-string / None sentinel selects the default names.
    # Testing an array-like with ``== ''`` is element-wise and cannot be
    # used as an ``if`` condition, so check the type explicitly.
    if line is None or (isinstance(line, str) and line == ''):
        names = counts.index
    else:
        names = list(line)
        if len(names) != len(counts):
            raise ValueError(
                f"got {len(names)} labels for {len(counts)} categories of {column!r}"
            )

    fig = px.pie(names=names, values=counts)
    fig.show()

In [None]:
# Respondents per country (the helpers keep the 20 most frequent values),
# shown as a bar chart and as a line chart.
plot_bar_chart(df,"Country")
plot_line_chart(df,"Country")

In [None]:
# Short display names keyed by the raw MainBranch answer. Renaming the values
# before counting attaches every label to its own category; passing a bare
# label list instead would pair labels with counts purely by frequency rank.
# NOTE(review): the answer strings are assumed to match the 2022 survey wording;
# verify against df["MainBranch"].unique(). Unmatched answers keep their raw text.
main_branch_labels = {
    "I am a developer by profession": "developer",
    "I am learning to code": "student",
    "I am not primarily a developer, but I write code sometimes as part of my work": "half-developer",
    "I code primarily as a hobby": "hobby",
    "I used to be a developer by profession, but no longer am": "used-developer",
    "None of these": "neither",
}
main_branch_df = df["MainBranch"].replace(main_branch_labels).to_frame()
plot_line_chart(main_branch_df, 'MainBranch')
plot_bar_chart(main_branch_df, 'MainBranch')
plot_pie_chart(main_branch_df, 'MainBranch')

In [None]:
# Frequency of each Employment answer as bar, line and pie charts.
plot_bar_chart(df,"Employment")
plot_line_chart(df,"Employment")
plot_pie_chart(df,"Employment")

In [None]:
# Frequency of each education level as bar and pie charts.
plot_bar_chart(df,"EdLevel")
plot_pie_chart(df,"EdLevel")

In [None]:
# Column names again; df has not been reassigned since it was loaded.
df.columns

In [None]:
# Frequency of the YearsCodePro answers (still raw, uncleaned values here).
plot_bar_chart(df,"YearsCodePro")
plot_pie_chart(df,"YearsCodePro")

In [None]:
# Look at the raw DevType column before it is split on ';' in the next cell.
df[["DevType"]]

In [None]:
# Each DevType answer is a ';'-separated list of roles. Drop missing answers,
# split the rest, and flatten the pieces into one row per selected role.
df_DevType = (
    df["DevType"]
    .dropna()
    .str.split(";")
    .explode()
    .reset_index(drop=True)  # plain 0..n-1 index, one row per role
    .to_frame(name="DevType")
)
df_DevType.value_counts()

In [None]:
# Most frequently selected developer roles, from the flattened DevType frame.
plot_bar_chart(df_DevType,"DevType")
plot_pie_chart(df_DevType,"DevType")

# <p style="padding:10px;background-color:#860404;margin:0;color:white;font-family:'Times New Roman',serif;font-size:150%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:500">Feature Selection</p>

In [None]:
# Keep only the modelling columns and give the compensation column a short name.
# Note: this rebinds ``df``, so the full survey frame is no longer available afterwards.
df = (
    df.loc[:, ["Country", "EdLevel", "YearsCodePro", "Employment", "ConvertedCompYearly"]]
      .rename(columns={"ConvertedCompYearly": "Salary"})
)
df.head()

In [None]:
# Rows without a salary cannot be used as training targets.
df = df.dropna(subset=["Salary"])
df.head()

In [None]:
# Dtypes and non-null counts after removing rows without a salary.
df.info()

In [None]:
# Discard every row that still has a missing value in any remaining column,
# then confirm that no nulls are left.
df = df.dropna(axis=0, how="any")
df.isna().sum()

In [None]:
# Columns remaining after the selection and rename above.
df.columns

In [None]:
# Number of remaining rows per country.
df["Country"].value_counts()

In [None]:
# Number of remaining rows per Employment answer.
df["Employment"].value_counts()

In [None]:
def print_unique_col_values(df):
    """Print ``<column>: <unique values>`` for every object-dtype column of ``df``.

    Columns with any other dtype are skipped. Nothing is returned.
    """
    object_columns = (name for name in df if df[name].dtypes == 'object')
    for name in object_columns:
        print(f'{name}: {df[name].unique()}')

In [None]:
# Show the distinct values of each text column to decide how to clean them.
print_unique_col_values(df)

In [None]:
# Restrict the data to full-time employees; the Employment column is then
# constant, so it is dropped.
df = (
    df.loc[df["Employment"].eq("Employed, full-time")]
      .drop(columns="Employment")
)
df.info()

In [None]:
# Rows per country among full-time employees; used to pick the cutoff below.
df['Country'].value_counts()

In [None]:
def shorten_categories(categories, cutoff):
    """Build a mapping that collapses rare categories into ``'Other'``.

    Parameters
    ----------
    categories : pandas.Series
        Counts indexed by category label (e.g. the result of ``value_counts``).
    cutoff : number
        Minimum count a category needs in order to keep its own label.

    Returns
    -------
    dict
        ``{label: label}`` for categories with ``count >= cutoff`` and
        ``{label: 'Other'}`` for all remaining ones.
    """
    return {
        label: (label if count >= cutoff else 'Other')
        for label, count in zip(categories.index, categories.values)
    }

In [None]:
# Countries with fewer than 199 rows are merged into a single 'Other' bucket.
country_map = shorten_categories(df["Country"].value_counts(), 199)
df = df.assign(Country=df["Country"].map(country_map))
df["Country"].value_counts()

In [None]:
# Salary distribution per country before any outlier filtering.
fig = px.box(df, x="Country", y="Salary")
fig.show()

In [None]:
# Keep salaries within [10000, 300000] (both bounds inclusive) and discard the
# aggregated 'Other' country bucket.
salary_in_range = df["Salary"].between(10000, 300000)
df = df[salary_in_range & (df["Country"] != "Other")]

In [None]:
# Same box plot again, now on the salary-filtered data without 'Other'.
fig = px.box(df, x="Country", y="Salary")
fig.show()

In [None]:
# Inspect the raw YearsCodePro answers to spot values that need special handling.
df["YearsCodePro"].unique()

In [None]:
def clean_experience(x):
    """Convert a YearsCodePro answer into a number of years.

    The two textual answers are mapped to fixed values
    (``'More than 50 years'`` -> 50, ``'Less than 1 year'`` -> 0.5);
    anything else is passed through ``float``.
    """
    textual_answers = (
        ('More than 50 years', 50),
        ('Less than 1 year', 0.5),
    )
    for answer, years in textual_answers:
        if x == answer:
            return years
    return float(x)

# Replace the textual experience answers with numeric years.
df = df.assign(YearsCodePro=df['YearsCodePro'].map(clean_experience))

In [None]:
# Inspect the raw education-level answers before grouping them.
df["EdLevel"].unique()

In [None]:
def clean_education(x):
    """Collapse an education-level answer into one of four buckets.

    Parameters
    ----------
    x : str
        Raw survey answer, or a label previously returned by this function.

    Returns
    -------
    str
        ``'Bachelor’s degree'``, ``'Master’s degree'``, ``'Post grad'`` or
        ``'Less than a Bachelors'``. The first matching rule wins, checked in
        that order.

    Notes
    -----
    The function is idempotent: feeding it one of its own outputs returns the
    same label, so re-running the notebook cell that applies it does not
    silently move 'Post grad' rows into 'Less than a Bachelors'.
    """
    buckets = (
        (('Bachelor’s degree',), 'Bachelor’s degree'),
        (('Master’s degree',), 'Master’s degree'),
        # 'Post grad' is listed so that already-cleaned values map to themselves.
        (('Professional degree', 'Other doctoral', 'Post grad'), 'Post grad'),
    )
    for needles, label in buckets:
        if any(needle in x for needle in needles):
            return label
    return 'Less than a Bachelors'

# Group the detailed education answers into the coarse buckets.
df = df.assign(EdLevel=df['EdLevel'].map(clean_education))

In [None]:
# Confirm that only the grouped education labels remain.
df["EdLevel"].unique()

In [None]:
# Display the cleaned frame that is handed to the modelling steps below.
df

In [None]:
# NOTE(review): displays the same frame as the previous cell; candidate for removal.
df

In [None]:
!pip install --upgrade pip -q
!pip install packaging -q
!pip install --pre pycaret -q
!pip install autoviz -q

In [None]:
# NOTE(review): the wildcard import hides where names come from. The visible cells
# below appear to rely on it for setup, compare_models, eda, create_model and
# interpret_model; confirm nothing else is needed before narrowing it.
from pycaret.regression import *

In [None]:
# Initialise the PyCaret regression experiment on the cleaned frame with Salary as target.
# session_id presumably fixes the random seed for reproducibility; confirm in the PyCaret docs.
setup(data = df,target = 'Salary',session_id = 85)

In [None]:
# Presumably trains and ranks PyCaret's candidate regressors on the experiment above; confirm.
compare_models() 

In [None]:
# NOTE(review): presumably depends on the autoviz package installed earlier; confirm.
eda()

In [None]:
# Build the 'xgboost' estimator through PyCaret and keep it for the interpretation cell.
xgboost_model = create_model('xgboost')

In [None]:
# Explain the fitted XGBoost model; the exact plot type depends on PyCaret's default (TODO confirm).
interpret_model(xgboost_model)

In [None]:
# Preprocessing for the manual scikit-learn workflow.
# Categorical features are one-hot encoded. LabelEncoder is intended for target
# labels: its fit/transform accept only ``y``, so it raises when used as a
# feature step inside a Pipeline / ColumnTransformer.
# handle_unknown='ignore' keeps transform() from failing on categories that
# were not present in the training split.
categorical_pipeline = Pipeline([('OneHot', OneHotEncoder(handle_unknown='ignore'))])

# Standardise the single numeric feature.
numeric_pipeline = Pipeline([('Scaler', StandardScaler())])

# Columns not listed here (e.g. the Salary target) are dropped by default.
transformer = ColumnTransformer([('Num', numeric_pipeline, ['YearsCodePro']),
                                 ('Category', categorical_pipeline, ['Country', 'EdLevel'])])


In [None]:
# For future work

In [None]:
# X = df.drop("Salary", axis=1)
# y = df["Salary"]

In [None]:
# X = df.iloc[:,:3]
# y = df["Salary"]

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.10, random_state=85) 

In [None]:
# X_train.sample(5)

In [None]:
# X_train = transformer.fit_transform(X_train)
# X_test = transformer.transform(X_test)

In [None]:
# from sklearn.linear_model import LinearRegression
# model = LinearRegression()
# model.fit(X_train,y_train)