# Opening CSV

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("ds-salaries.csv")

In [3]:
df

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2021e,EN,FT,Data Science Consultant,54000,EUR,64369,DE,50,DE,L
1,2020,SE,FT,Data Scientist,60000,EUR,68428,GR,100,US,L
2,2021e,EX,FT,Head of Data Science,85000,USD,85000,RU,0,RU,M
3,2021e,EX,FT,Head of Data,230000,USD,230000,RU,50,RU,L
4,2021e,EN,FT,Machine Learning Engineer,125000,USD,125000,US,100,US,S
...,...,...,...,...,...,...,...,...,...,...,...
240,2020,SE,FT,Data Scientist,412000,USD,412000,US,100,US,L
241,2021e,MI,FT,Principal Data Scientist,151000,USD,151000,US,100,US,L
242,2020,EN,FT,Data Scientist,105000,USD,105000,US,100,US,S
243,2020,EN,CT,Business Data Analyst,100000,USD,100000,US,100,US,L


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245 entries, 0 to 244
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           245 non-null    object
 1   experience_level    245 non-null    object
 2   employment_type     245 non-null    object
 3   job_title           245 non-null    object
 4   salary              245 non-null    int64 
 5   salary_currency     245 non-null    object
 6   salary_in_usd       245 non-null    int64 
 7   employee_residence  245 non-null    object
 8   remote_ratio        245 non-null    int64 
 9   company_location    245 non-null    object
 10  company_size        245 non-null    object
dtypes: int64(3), object(8)
memory usage: 21.2+ KB


In [5]:
df.describe()

Unnamed: 0,salary,salary_in_usd,remote_ratio
count,245.0,245.0,245.0
mean,502541.8,99868.012245,69.183673
std,2276230.0,83983.326949,37.593421
min,4000.0,2876.0,0.0
25%,60000.0,45896.0,50.0
50%,103000.0,81000.0,100.0
75%,174000.0,130000.0,100.0
max,30400000.0,600000.0,100.0


### Working with non-numerical attributes

In [36]:
non_numerical_set = set(df.columns) - {"salary", "salary_in_usd", "remote_ratio"}
non_numerical_list = list(non_numerical_set)

In [37]:
def non_numerical(value):
    cont = value.value_counts()
    unique = value.unique()
    nunique = value.nunique()
    
    string = f"VALUE_COUNTS = {cont} \n\nUNIQUE = {unique} \n\nNUNIQUE = {nunique}"
    
    return string

In [38]:
for i in non_numerical_list:
    print(non_numerical(df[i]), end = "\n\n\n")

VALUE_COUNTS = work_year
2021e    179
2020      66
Name: count, dtype: int64 

UNIQUE = ['2021e' '2020'] 

NUNIQUE = 2


VALUE_COUNTS = salary_currency
USD    126
EUR     57
INR     21
GBP     13
CAD     10
TRY      3
PLN      2
HUF      2
SGD      2
MXN      2
DKK      2
BRL      2
CLP      1
JPY      1
CNY      1
Name: count, dtype: int64 

UNIQUE = ['EUR' 'USD' 'CAD' 'INR' 'PLN' 'GBP' 'HUF' 'SGD' 'MXN' 'TRY' 'CLP' 'JPY'
 'DKK' 'CNY' 'BRL'] 

NUNIQUE = 15


VALUE_COUNTS = employee_residence
US    92
IN    22
DE    19
FR    13
GB    13
CA     9
ES     7
JP     4
NL     4
GR     4
BR     4
PL     3
PT     3
TR     3
IT     3
PK     3
RU     3
MX     2
DK     2
RO     2
AT     2
NG     2
SG     2
HU     2
VN     2
LU     1
MD     1
SI     1
HK     1
CN     1
HR     1
BE     1
CL     1
KE     1
IR     1
NZ     1
CO     1
BG     1
RS     1
PR     1
JE     1
AE     1
UA     1
PH     1
MT     1
Name: count, dtype: int64 

UNIQUE = ['DE' 'GR' 'RU' 'US' 'FR' 'AT' 'CA' 'UA' 'NG' 'PK' 'IN' 'GB'

### Numerical attributes

In [40]:
numerical_set = set(df.columns) - non_numerical_set
numerical_list = list(numerical_set)

['salary', 'remote_ratio', 'salary_in_usd']

In respect to central tedency, we are going to calculate the mean, median, mode, midpoint, max and min

In [41]:
def central_tedency(value):
    minimum = value.min()
    maximum = value.max()
    mean = value.mean()
    mode = value.mode()[0]
    median = value.median()
    midpoint = (maximum - minimum)/2
    
    string = f"Min = {minimum}, Max = {maximum}, Mean = {mean}, Mode = {mode}, Median = {median}, Midpoint = {midpoint}"
    
    return string

In [42]:
for i in numerical_list:
    print(f"For {i}, we have:")
    print(central_tedency(df[i]))

For salary, we have:
Min = 4000, Max = 30400000, Mean = 502541.81224489794, Mode = 80000, Median = 103000.0, Midpoint = 15198000.0
For remote_ratio, we have:
Min = 0, Max = 100, Mean = 69.18367346938776, Mode = 100, Median = 100.0, Midpoint = 50.0
For salary_in_usd, we have:
Min = 2876, Max = 600000, Mean = 99868.01224489795, Mode = 150000, Median = 81000.0, Midpoint = 298562.0


In respect of divergence, we are going to calculate the var, std, quantiles (1st, 2st and 3st quartiles) and range

In [49]:
def divergence(value):
    range = value.max() - value.min()
    var = value.var()
    std = value.std()
    quantile = value.quantile([0.25, 0.5, 0.75])
    
    string = f"Range = {range}, Var = {var}, Std = {std}, 1st quartile = {quantile[0.25]}, 2st quartile = {quantile[0.5]}, 3st quartile = {quantile[0.75]}"
    
    return string

In [50]:
for i in numerical_list:
    print(f"For {i}, we have:")
    print(divergence(df[i]))

For salary, we have:
Range = 30396000, Var = 5181223548855.601, Std = 2276230.1177287856, 1st quartile = 60000.0, 2st quartile = 103000.0, 3st quartile = 174000.0
For remote_ratio, we have:
Range = 100, Var = 1413.2653061224457, Std = 37.59342104840215, 1st quartile = 50.0, 2st quartile = 100.0, 3st quartile = 100.0
For salary_in_usd, we have:
Range = 597124, Var = 7053199205.446569, Std = 83983.32694914253, 1st quartile = 45896.0, 2st quartile = 81000.0, 3st quartile = 130000.0


About data correlation:

In [52]:
df[numerical_list].corr()

Unnamed: 0,salary,remote_ratio,salary_in_usd
salary,1.0,-0.004775,-0.087365
remote_ratio,-0.004775,1.0,0.17124
salary_in_usd,-0.087365,0.17124,1.0
