In [1]:
import pandas as pd
# Read the raw CSV into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer='/content/data_cleaning.csv')

In [2]:
# 1. Basic data checks: column dtypes / non-null counts, then numeric summary.
print('Dataset Info:')
df.info()  # info() writes its report straight to stdout
# One print call with a newline separator yields the same two-line layout
# as printing the header and the describe() table separately.
print('\nSummary Statistics:', df.describe(), sep='\n')

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Index        29 non-null     int64  
 1   Age          22 non-null     float64
 2   Salary       29 non-null     object 
 3   Rating       28 non-null     float64
 4   Location     29 non-null     object 
 5   Established  29 non-null     int64  
 6   Easy Apply   29 non-null     object 
dtypes: float64(2), int64(2), object(3)
memory usage: 1.7+ KB

Summary Statistics:
           Index        Age     Rating  Established
count  29.000000  22.000000  28.000000    29.000000
mean   14.000000  39.045455   3.528571  1638.620690
std     8.514693  16.134781   2.825133   762.079599
min     0.000000  13.000000  -1.000000    -1.000000
25%     7.000000  25.000000   1.050000  1935.000000
50%    14.000000  39.500000   4.200000  1984.000000
75%    21.000000  50.000000   5.400000  1999.000000
max    

In [3]:
# Display the DataFrame as loaded, before any of the cleaning cells below run.
df

Unnamed: 0,Index,Age,Salary,Rating,Location,Established,Easy Apply
0,0,44.0,$44k-$99k,5.4,"India,In",1999,TRUE
1,1,66.0,$55k-$66k,3.5,"New York,Ny",2002,TRUE
2,2,,$77k-$89k,-1.0,"New York,Ny",-1,-1
3,3,64.0,$44k-$99k,4.4,India In,1988,-1
4,4,25.0,$44k-$99k,6.4,Australia Aus,2002,-1
5,5,44.0,$77k-$89k,1.4,"India,In",1999,TRUE
6,6,21.0,$44k-$99k,0.0,"New York,Ny",-1,-1
7,7,44.0,$44k-$99k,-1.0,Australia Aus,-1,-1
8,8,35.0,$44k-$99k,5.4,"New York,Ny",-1,-1
9,9,22.0,$44k-$99k,7.7,"India,In",-1,TRUE


In [4]:
# 2. Column adjustments: remove the 'Index' column when it is present.
# errors='ignore' turns the drop into a no-op if the column does not exist,
# which replaces the explicit membership check.
df = df.drop(columns=['Index'], errors='ignore')

In [5]:
# 3. Handling missing values: every NaN, in every column, becomes 0.
# Rebinding `df` (rather than mutating in place) keeps the step explicit.
df = df.fillna(value=0)

In [6]:
# 4. Data type corrections: cast Age to integer when the column exists.
# assign() swaps the column's values while keeping its position in the frame.
if 'Age' in df:
    df = df.assign(Age=df['Age'].astype(int))

In [7]:
# Re-display the DataFrame to inspect the result of the cleaning cells above.
df

Unnamed: 0,Age,Salary,Rating,Location,Established,Easy Apply
0,44,$44k-$99k,5.4,"India,In",1999,TRUE
1,66,$55k-$66k,3.5,"New York,Ny",2002,TRUE
2,0,$77k-$89k,-1.0,"New York,Ny",-1,-1
3,64,$44k-$99k,4.4,India In,1988,-1
4,25,$44k-$99k,6.4,Australia Aus,2002,-1
5,44,$77k-$89k,1.4,"India,In",1999,TRUE
6,21,$44k-$99k,0.0,"New York,Ny",-1,-1
7,44,$44k-$99k,-1.0,Australia Aus,-1,-1
8,35,$44k-$99k,5.4,"New York,Ny",-1,-1
9,22,$44k-$99k,7.7,"India,In",-1,TRUE


In [8]:
# 5. Salary column adjustments: split a range such as "$44k-$99k" into the
# integer columns Min_Salary / Max_Salary, then drop the original text column.
if 'Salary' in df.columns:
    salary_bounds = (
        df['Salary']
        # regex=False is required for portability: under the regex default of
        # pandas < 2.0, '$' is an end-of-string anchor and the literal dollar
        # sign would be left in place, making the int conversion below fail.
        .str.replace('$', '', regex=False)
        # Expand the lowercase "k" suffix to thousands ("44k" -> "44000").
        .str.replace('k', '000', regex=False)
        # n=1 splits only on the first '-', so there are at most two parts;
        # a malformed value with extra dashes fails loudly in astype(int).
        .str.split('-', n=1, expand=True)
    )
    df['Min_Salary'] = salary_bounds[0].astype(int)
    df['Max_Salary'] = salary_bounds[1].astype(int)
    df = df.drop(columns=['Salary'])

**Rating Column Adjustments:**

In [9]:
# 6. Rating column adjustments: remove the negative sign from ratings and
# store the column as float.
# The sign is removed numerically with abs() instead of deleting '-' from the
# string form: string stripping also removes the exponent sign of values
# rendered in scientific notation (1e-05 -> "1e05" -> 100000.0).
# NOTE(review): -1.0 looks like a "missing" placeholder (it appears alongside
# Established == -1); after this step it becomes a rating of 1.0 - confirm
# that is intended.
if 'Rating' in df.columns:
    df['Rating'] = df['Rating'].astype(float).abs()

In [10]:
# 7. Easy Apply column adjustments: convert the column to real booleans.
# A value is True only when it is explicitly affirmative ("TRUE" / "Yes",
# compared case-insensitively); everything else, including the "-1"
# placeholder, is False. The earlier approach relied on every non-empty
# string being truthy, so a value such as "FALSE" or "No" would have been
# converted to True.
# Going through str + upper also keeps the cell safe to re-run on a column
# that is already boolean (True -> "TRUE").
if 'Easy Apply' in df.columns:
    affirmative_values = ['TRUE', 'YES']
    df['Easy Apply'] = (
        df['Easy Apply']
        .astype(str)
        .str.strip()
        .str.upper()
        .isin(affirmative_values)
    )

**Additional Data Cleaning**

In [11]:
# Persist the cleaned dataset; index=False leaves the row index out of the file.
df.to_csv(path_or_buf="cleaned_data.csv", index=False)

# Show the first rows of the cleaned data. A single print with a newline
# separator produces the same output as printing header and table separately.
print("\ncleaned Data Preview:", df.head(), sep="\n")


cleaned Data Preview:
   Age  Rating       Location  Established  Easy Apply  Min_Salary  Max_Salary
0   44     5.4       India,In         1999        True       44000       99000
1   66     3.5    New York,Ny         2002        True       55000       66000
2    0     1.0    New York,Ny           -1       False       77000       89000
3   64     4.4       India In         1988       False       44000       99000
4   25     6.4  Australia Aus         2002       False       44000       99000
