# US Software Engineers Analysis - Solution

In [1]:
# we start by importing all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [None]:
# Load the dataset, using the first CSV column as the row index,
# then preview the first five rows.
df = pd.read_csv("data.csv", index_col=0)
df.head(5)

In [None]:
# Print a structural summary of the frame: one line per column with its
# dtype and non-null count, which shows where values are missing.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the dataset;
# by default pandas restricts this to the numeric columns when any exist.
df.describe()

In [None]:
# Print, for every column, how often each distinct value occurs.
for column_name, column_values in df.items():
    print(column_name, column_values.value_counts())

## Question 1

In [None]:
# value_counts() sorts by frequency (descending), so the first three rows
# are the three most common job titles.
df["title"].value_counts().head(3)

## Question 2

In [None]:
# Split the postings by location: exactly "Remote" versus everything else
# (rows with a missing location end up in the non-remote group).
remote = df.loc[df["location"].eq("Remote")]
non_remote = df.loc[df["location"].ne("Remote")]
remote.shape[0], non_remote.shape[0]

In [None]:
# Compare the central tendency of the ratings in both groups
# (median() and mean() skip missing ratings).
print("Comparison Ratings remote and not-remote jobs:")
print(f"Median Rating, Remote: {remote['rating'].median():.2f}, Non-Remote: {non_remote['rating'].median():.2f}")
print(f"Mean Rating, Remote: {remote['rating'].mean():.2f}, Non-Remote: {non_remote['rating'].mean():.2f}")

In [None]:
def _rating_distribution(ratings):
    """Return the share (0-1) of each rating, rounded to the nearest whole number.

    Missing ratings are ignored because ``value_counts`` drops NaN by default,
    so the shares sum to 1 over the postings that actually have a rating.
    The result is sorted by rating value.
    """
    return ratings.round().value_counts(normalize=True).sort_index()


# Same computation for both groups; the variables hold fractions, not percentages.
remote_distribution = _rating_distribution(remote['rating'])
non_remote_distribution = _rating_distribution(non_remote['rating'])

# sharey=True puts both panels on one scale so bar heights are directly comparable.
f, ax = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
panels = (('Remote', remote_distribution), ("Not Remote", non_remote_distribution))
for axis, (panel_title, distribution) in zip(ax, panels):
    # Convert the fractions to percent only for display.
    axis.bar(distribution.index, distribution.values * 100)
    axis.set_title(panel_title)
    axis.set_xlabel('Rating')
# With a shared y-axis a single label on the left panel describes both.
ax[0].set_ylabel('Share of ratings (%)')
f.tight_layout()
plt.show()

It is difficult to evaluate the result because there are many more NaN values in the remote dataset. We can see that more people voted in the non-remote dataset and that, in both datasets, most people who voted gave a rating of 4 (after rounding). The mean and median ratings are higher in the non-remote dataset.

## Question 3

In [None]:
# Inspect the raw salary strings of the remote postings and how often each occurs.
remote["salary"].value_counts()

1. We have to make a few assumptions about the salary values. If a range is given, then we calculate its mean. If it's an hourly salary, then we multiply it by 40 * 52, since we assume the person works 40 hours a week and a year consists of 52 weeks.  
2. We have to code these requirements since we need integers or floats to calculate the average salary.

In [None]:
# Non-null count and dtype of the salary column for remote postings.
remote["salary"].info()

In [None]:
# Non-null count and dtype of the salary column for non-remote postings.
non_remote["salary"].info()

In [None]:
# Two hand-written salary strings in the "range per year" format.
data = pd.Series(
    [
        "$132,000 - $192,000 a year",
        "$110,000 - $150,000 a year",
    ]
)

# Define a function to extract numbers from a string
def extract_numbers(text):
    """Extract the salary figures from a salary string as yearly amounts.

    Thousands separators are removed and every number in the text is read,
    including decimal amounts such as ``45.50`` (kept as one value instead of
    being split into ``45`` and ``50``).

    Each figure is converted to a yearly amount:

    * If the text states a pay period ("an hour", "a day", "a week",
      "a month", "a year", or the same with "per"), that period is used.
      Assumptions: 40 working hours per week, 5 working days per week,
      52 weeks per year.
    * Otherwise the magnitude heuristic applies: a figure of at most 2000 is
      taken to be an hourly wage (we assume that no one has a larger hourly
      salary than 2000 dollars), anything larger is taken to be yearly.

    Parameters
    ----------
    text : object
        The salary description; it is converted with ``str`` first, so a
        missing value such as NaN simply yields no figures.

    Returns
    -------
    list
        One yearly amount per figure found, in order of appearance (``int``
        for whole figures, ``float`` for decimal ones). Empty if the text
        contains no number.
    """
    hours_per_year = 40 * 52
    periods_per_year = {
        "hour": hours_per_year,
        "day": 5 * 52,
        "week": 52,
        "month": 12,
        "year": 1,
    }
    hourly_threshold = 2000

    text = str(text).replace(",", "")
    # Prefer the period written in the posting over guessing from magnitude.
    period = re.search(
        r"\b(?:an?|per)\s+(hour|day|week|month|year)\b", text, flags=re.IGNORECASE
    )

    amounts = []
    for token in re.findall(r"\d+(?:\.\d+)?", text):
        value = float(token) if "." in token else int(token)
        if period is not None:
            factor = periods_per_year[period.group(1).lower()]
        elif value > hourly_threshold:
            factor = 1
        else:
            factor = hours_per_year
        amounts.append(value * factor)
    return amounts

def formatter(pd_series):
    """Turn a Series of salary strings into one numeric figure per entry.

    Every element is passed through ``extract_numbers``; the figures found
    for that element are averaged (so a range becomes its midpoint). An
    element that yields no figures becomes ``np.nan``.
    """
    def average(figures):
        # Guard the empty case so we never divide by zero.
        if len(figures) == 0:
            return np.nan
        return sum(figures) / len(figures)

    return pd_series.apply(extract_numbers).apply(average)

# Keep only the postings that actually state a salary.
remote = remote[remote["salary"].notna()]
non_remote = non_remote[non_remote["salary"].notna()]

# One numeric salary figure per posting, then compare the group means.
salaries_remote = formatter(remote["salary"])
salaries_not_remote = formatter(non_remote["salary"])
print(
    f"Average Salaries, Remote: {salaries_remote.mean():.2f}, "
    f"Non Remote: {salaries_not_remote.mean():.2f}"
)

We can see that the salaries are pretty similar: salary-wise, it doesn't make a big difference whether you work in the office or remotely.