In [1]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the salary survey CSV (expected next to the notebook) into a DataFrame.
# The bare `df` on the last line renders the frame as the cell's output.
DATA_FILE = "Software_Professional_Salaries.csv"
df = pd.read_csv(DATA_FILE)
df


Unnamed: 0,Rating,Company Name,Job Title,Salary,Salaries Reported,Location
0,3.8,Sasken,Android Developer,400000,3,Bangalore
1,4.5,Advanced Millennium Technologies,Android Developer,400000,3,Bangalore
2,4.0,Unacademy,Android Developer,1000000,3,Bangalore
3,3.8,SnapBizz Cloudtech,Android Developer,300000,3,Bangalore
4,4.4,Appoids Tech Solutions,Android Developer,600000,3,Bangalore
...,...,...,...,...,...,...
22769,4.7,Expert Solutions,Web Developer,200000,1,Bangalore
22770,4.0,Nextgen Innovation Labs,Web Developer,300000,1,Bangalore
22771,4.1,Fresher,Full Stack Web Developer,192000,13,Bangalore
22772,4.1,Accenture,Full Stack Web Developer,300000,7,Bangalore


In [3]:
# Preview the top of the frame: the first 5 rows
df.head(5)

Unnamed: 0,Rating,Company Name,Job Title,Salary,Salaries Reported,Location
0,3.8,Sasken,Android Developer,400000,3,Bangalore
1,4.5,Advanced Millennium Technologies,Android Developer,400000,3,Bangalore
2,4.0,Unacademy,Android Developer,1000000,3,Bangalore
3,3.8,SnapBizz Cloudtech,Android Developer,300000,3,Bangalore
4,4.4,Appoids Tech Solutions,Android Developer,600000,3,Bangalore


In [4]:
# Preview the bottom of the frame: the last 5 rows
df.tail(5)

Unnamed: 0,Rating,Company Name,Job Title,Salary,Salaries Reported,Location
22769,4.7,Expert Solutions,Web Developer,200000,1,Bangalore
22770,4.0,Nextgen Innovation Labs,Web Developer,300000,1,Bangalore
22771,4.1,Fresher,Full Stack Web Developer,192000,13,Bangalore
22772,4.1,Accenture,Full Stack Web Developer,300000,7,Bangalore
22773,3.8,Thomson Reuters,Associate Web Developer,300000,7,Bangalore


In [5]:
# Size of the frame, shown as a (row count, column count) tuple
n_rows, n_cols = df.shape
(n_rows, n_cols)

(22774, 6)

In [6]:
# Print a concise summary of the frame: the index range, each column's
# non-null count and dtype, and the approximate memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22774 entries, 0 to 22773
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Rating             22774 non-null  float64
 1   Company Name       22774 non-null  object 
 2   Job Title          22774 non-null  object 
 3   Salary             22774 non-null  int64  
 4   Salaries Reported  22774 non-null  int64  
 5   Location           22774 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 1.0+ MB


In [7]:
# Count missing values per column:
#   1. isnull() builds a boolean mask that is True wherever a cell is missing
#   2. sum() adds up the True entries of each column
missing_mask = df.isnull()
missing_mask.sum()

Rating               0
Company Name         0
Job Title            0
Salary               0
Salaries Reported    0
Location             0
dtype: int64

In [8]:
# Flag every row that exactly repeats an earlier row, then count the flags
is_repeat_row = df.duplicated()
is_repeat_row.sum()

0

In [9]:
# Boolean mask: True for rows whose job title includes "Engineer".
# The match is case-sensitive, so a title spelled "engineer" is not selected.
is_engineering_title = df['Job Title'].str.contains("Engineer")

# Keep only the matching rows
engineering_jobs = df.loc[is_engineering_title]

# One row per reported job, so the row count is the number of engineering jobs
num_engineering_jobs = len(engineering_jobs.index)

print("Number of engineering jobs:", num_engineering_jobs)


Number of engineering jobs: 10797


In [10]:
# Boolean mask: True for rows whose job title includes "Developer".
# The match is case-sensitive, so a title spelled "developer" is not selected.
is_developer_title = df['Job Title'].str.contains("Developer")

# Keep only the matching rows
developer_jobs = df.loc[is_developer_title]

# One row per reported job, so the row count is the number of developer jobs
num_developer_jobs = len(developer_jobs.index)

print("Number of developer jobs:", num_developer_jobs)


Number of developer jobs: 10769


In [11]:
# Arithmetic mean of the Salary column across every row of the dataset
salary_column = df['Salary']
average_salary = salary_column.mean()
print("Average salary:", average_salary)


Average salary: 695360.621761658


In [12]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the frame
summary_stats = df.describe()
summary_stats

Unnamed: 0,Rating,Salary,Salaries Reported
count,22774.0,22774.0,22774.0
mean,3.918249,695360.6,1.855625
std,0.519676,884326.3,6.823078
min,1.0,2112.0,1.0
25%,3.7,300000.0,1.0
50%,3.9,500000.0,1.0
75%,4.2,900000.0,1.0
max,5.0,90000000.0,361.0


In [13]:
# Number of distinct values in the "Company Name" column
company_names = df['Company Name']
num_companies = company_names.nunique()

print("Number of unique companies:", num_companies)


Number of unique companies: 11263


In [14]:
# Number of distinct values in the "Job Title" column
job_titles = df['Job Title']
num_job_titles = job_titles.nunique()

print("Number of unique job titles:", num_job_titles)


Number of unique job titles: 1084


In [15]:
# Highest salary recorded under each job title (a Series indexed by job title)
salary_by_title = df.groupby('Job Title')['Salary']
max_salary_by_title = salary_by_title.max()

# The overall top salary, and the job title it belongs to
highest_salary = max_salary_by_title.max()
highest_salary_title = max_salary_by_title.idxmax()

# Report the best-paid job title together with its salary
print("Job Title with the highest salary:", highest_salary_title)
print("Highest salary:", highest_salary)


Job Title with the highest salary: Software Development Engineer (SDE)
Highest salary: 90000000


In [16]:
# Number of distinct values in the "Location" column
location_column = df['Location']
num_locations = location_column.nunique()

print("Number of unique locations:", num_locations)


Number of unique locations: 10


In [17]:
# Tally how many rows fall under each location
location_counts = df['Location'].value_counts()

# The tally has one index entry per distinct location, so the length of its
# index is the number of unique locations
num_locations = len(location_counts.index)

# Show the number of locations followed by the per-location row counts
print("Number of unique locations:", num_locations)
print("Locations and their counts:")
print(location_counts)


Number of unique locations: 10
Locations and their counts:
Bangalore         8265
Hyderabad         4468
New Delhi         4176
Chennai           2458
Pune              2135
Mumbai             749
Kolkata            178
Madhya Pradesh     155
Kerala             108
Jaipur              82
Name: Location, dtype: int64


In [18]:
# Largest value found in the Rating column
highest_rating = df['Rating'].max()

# Row label of the top rating, then the company named on that row.
# idxmax() returns the first occurrence only: if several rows share the top
# rating, just the first of them is reported here.
top_rated_row = df['Rating'].idxmax()
highest_rating_company = df.loc[top_rated_row, 'Company Name']

# Report the company and its rating
print("Company with the highest rating:", highest_rating_company)
print("Highest rating:", highest_rating)


Company with the highest rating: powerplay app
Highest rating: 5.0


In [19]:
# Smallest value found in the Rating column
lowest_rating = df['Rating'].min()

# Row label of the bottom rating, then the company named on that row.
# idxmin() returns the first occurrence only: if several rows share the lowest
# rating, just the first of them is reported here.
lowest_rated_row = df['Rating'].idxmin()
lowest_rating_company = df.loc[lowest_rated_row, 'Company Name']

# Report the company and its rating
print("company with the lowest rating:",lowest_rating_company)
print("lowest rating:",lowest_rating)

company with the lowest rating: Pricyfy
lowest rating: 1.0


In [20]:
# Mean of the Rating column within each company (a Series indexed by company name)
ratings_by_company = df.groupby('Company Name')['Rating']
average_ratings = ratings_by_company.mean()

# Show the per-company averages as text
print("Average ratings by company:")
print(average_ratings)


Average ratings by company:
Company Name
(X,Y,Z) Architecture & Design    4.00
(no)name                         4.00
-                                3.95
....                             4.00
.Kreate                          4.50
                                 ... 
zekeLabs                         4.40
Ás Formaturas                    3.60
Órama                            3.50
​App-Scoop                       4.50
‎eNotice Ninja Pluss             3.80
Name: Rating, Length: 11263, dtype: float64


In [21]:
# Mean salary within each location (a Series indexed by location)
salary_by_location = df.groupby('Location')['Salary']
average_salary_by_location = salary_by_location.mean()

# The largest of those per-location means, and the location it belongs to
highest_average_salary = average_salary_by_location.max()
location_with_highest_salary = average_salary_by_location.idxmax()

# Report the best-paying location and its average salary
print("Location with the highest average salary:", location_with_highest_salary)
print("Highest average salary:", highest_average_salary)


Location with the highest average salary: Mumbai
Highest average salary: 961180.3684913218


In [22]:
# Mean salary within each location (a Series indexed by location).
# Recomputed here so this cell does not depend on any other cell's variables.
salary_by_location = df.groupby('Location')['Salary']
average_salary_by_location = salary_by_location.mean()

# The smallest of those per-location means, and the location it belongs to
lowest_average_salary = average_salary_by_location.min()
location_with_lowest_salary = average_salary_by_location.idxmin()

# Report the lowest-paying location and its average salary
print("Location with the lowest average salary:", location_with_lowest_salary)
print("Lowest average salary:", lowest_average_salary)


Location with the lowest average salary: Kerala
Lowest average salary: 553577.4814814815


In [23]:
# Rows per location; each row of the dataset is treated as one job
job_counts_by_location = df['Location'].value_counts()

# The largest of those counts, and the location it belongs to
most_jobs_count = job_counts_by_location.max()
location_with_most_jobs = job_counts_by_location.idxmax()

# Report the busiest location and how many jobs it has
print("Location with the most jobs:", location_with_most_jobs)
print("Number of jobs:", most_jobs_count)


Location with the most jobs: Bangalore
Number of jobs: 8265


In [24]:
# For every job title, report the location where its highest salary was recorded.

# Highest salary for each (job title, location) pair
max_salary_by_job_location = df.groupby(['Job Title', 'Location'])['Salary'].max()

# Flatten the grouped result into a DataFrame with the columns
# 'Job Title', 'Location' and 'Salary'
max_salary_df = max_salary_by_job_location.reset_index()

# Order the pairs from the highest salary down to the lowest
max_salary_df_sorted = max_salary_df.sort_values('Salary', ascending=False)

# Keep only the first row of each job title. Because the frame is sorted by
# salary (descending), that row is the title's best-paid location. One pass
# here avoids re-filtering the whole frame once per job title, and the titles
# still come out in order of first appearance in the sorted frame.
top_location_per_title = max_salary_df_sorted.drop_duplicates(subset='Job Title', keep='first')

# NOTE: the loop rebinds `highest_salary` on every iteration, so after this
# cell the name holds the value of the last job title printed.
for job_title, highest_salary_location, highest_salary in top_location_per_title[
    ['Job Title', 'Location', 'Salary']
].itertuples(index=False, name=None):
    print("Job Title:", job_title)
    print("Location with the highest salary:", highest_salary_location)
    print("Highest Salary:", highest_salary)
    print()


Job Title: Software Development Engineer (SDE)
Location with the highest salary: New Delhi
Highest Salary: 90000000

Job Title: Oracle Database Administrator
Location with the highest salary: Bangalore
Highest Salary: 10000000

Job Title: Senior Front End Developer
Location with the highest salary: Pune
Highest Salary: 10000000

Job Title: Senior Java Developer
Location with the highest salary: Chennai
Highest Salary: 10000000

Job Title: Lead UI Designer, Magento Front-end Developer
Location with the highest salary: Bangalore
Highest Salary: 9900000

Job Title: Non Software Development Engineer
Location with the highest salary: Mumbai
Highest Salary: 9800000

Job Title: Software Development Engineer (SDE) II
Location with the highest salary: Hyderabad
Highest Salary: 9700000

Job Title: Best Buy Mobile Sales Associate
Location with the highest salary: Bangalore
Highest Salary: 9600000

Job Title: Software Development Engineer In Test (SDET)
Location with the highest salary: Pune
Highe

Job Title: Senior Staff Software Development Engineer
Location with the highest salary: Hyderabad
Highest Salary: 2700000

Job Title: Software Development Senior Engineer
Location with the highest salary: Hyderabad
Highest Salary: 2700000

Job Title: Software Development Engineer IV
Location with the highest salary: Hyderabad
Highest Salary: 2700000

Job Title: IT Software Development Engineer II
Location with the highest salary: Hyderabad
Highest Salary: 2700000

Job Title: Consultant Database Administrator
Location with the highest salary: Bangalore
Highest Salary: 2700000

Job Title: Mobile Developer
Location with the highest salary: Bangalore
Highest Salary: 2700000

Job Title: Software Development Engineer - Intern
Location with the highest salary: Kolkata
Highest Salary: 2700000

Job Title: Software Development Engineer -2
Location with the highest salary: Mumbai
Highest Salary: 2700000

Job Title: Sr IT Software Development Engineer in Test
Location with the highest salary: Hyde

Job Title: Technical Java Lead Developer
Location with the highest salary: Chennai
Highest Salary: 1300000

Job Title: Amazon Software Development Engineer I
Location with the highest salary: Hyderabad
Highest Salary: 1300000

Job Title: Android Applications Developer
Location with the highest salary: Bangalore
Highest Salary: 1300000

Job Title: Senior Database Programmer
Location with the highest salary: Bangalore
Highest Salary: 1300000

Job Title: Mssql Database Administrator
Location with the highest salary: Bangalore
Highest Salary: 1300000

Job Title: Web Developer - Contractor
Location with the highest salary: Bangalore
Highest Salary: 1300000

Job Title: Senior IOS Developer - Contractor
Location with the highest salary: New Delhi
Highest Salary: 1300000

Job Title: Senior Software Engineer- IOS Developer
Location with the highest salary: New Delhi
Highest Salary: 1300000

Job Title: Python Engineer
Location with the highest salary: Hyderabad
Highest Salary: 1300000

Job Title


Job Title: IOS Engineer - Intern
Location with the highest salary: Bangalore
Highest Salary: 240000

Job Title: Front End Developer - Angular
Location with the highest salary: Pune
Highest Salary: 240000

Job Title: Android App Development
Location with the highest salary: Bangalore
Highest Salary: 228000

Job Title: Trainee Junior Python Developer
Location with the highest salary: Chennai
Highest Salary: 228000

Job Title: Java Programmer - Intern
Location with the highest salary: Hyderabad
Highest Salary: 228000

Job Title: Java Software Developer - Intern
Location with the highest salary: Hyderabad
Highest Salary: 228000

Job Title: Jr Software Engineer Development in Test
Location with the highest salary: Mumbai
Highest Salary: 228000

Job Title: Backend Web Developer - Intern
Location with the highest salary: Hyderabad
Highest Salary: 228000

Job Title: Front End Development Manager
Location with the highest salary: Bangalore
Highest Salary: 228000

Job Title: Front End Chasier
L