# **Setup & Configuration**

In [1]:
# 1. IMPORT LIBRARIES

# Data handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# **Data Ingestion**

In [2]:
# 2. LOAD DATA
# Read the raw CSV from its absolute location on the notebook host
df = pd.read_csv(
    "/content/global_freelancers_raw_data.csv"
)


# **Data Profiling / Initial Inspection**

In [3]:
# 2.1 QUICK SHAPE OVERVIEW
# (number of rows, number of columns) of the freshly loaded frame
df.shape

(1000, 12)

In [4]:
# 🔍 3. DATA EXPLORATION / OVERVIEW

# 3.1 Quick look: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   freelancer_ID        1000 non-null   object 
 1   name                 1000 non-null   object 
 2   gender               1000 non-null   object 
 3   age                  970 non-null    float64
 4   country              1000 non-null   object 
 5   language             1000 non-null   object 
 6   primary_skill        1000 non-null   object 
 7   years_of_experience  949 non-null    float64
 8   hourly_rate (USD)    906 non-null    object 
 9   rating               899 non-null    float64
 10  is_active            911 non-null    object 
 11  client_satisfaction  824 non-null    object 
dtypes: float64(3), object(9)
memory usage: 93.9+ KB


In [5]:
# Count the missing values in each column, largest count first

df.isnull().sum().sort_values(ascending=False)

Unnamed: 0,0
client_satisfaction,176
rating,101
hourly_rate (USD),94
is_active,89
years_of_experience,51
age,30
freelancer_ID,0
name,0
gender,0
primary_skill,0


In [6]:
# Row-wise duplicate flags (True where a row repeats an earlier row).
# NOTE(review): the displayed mask is truncated, so it cannot confirm that no
# duplicates exist; `df.duplicated().sum()` would give a single count instead.
df.duplicated()


Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
995,False
996,False
997,False
998,False


In [7]:
# Inspect the raw column names before renaming them
df.columns

Index(['freelancer_ID', 'name', 'gender', 'age', 'country', 'language',
       'primary_skill', 'years_of_experience', 'hourly_rate (USD)', 'rating',
       'is_active', 'client_satisfaction'],
      dtype='object')

# **Data Cleaning & Wrangling**

In [8]:
# Normalise column labels for consistency and easier access: trim surrounding
# whitespace, lowercase, and replace spaces with underscores
# (e.g. 'hourly_rate (USD)' -> 'hourly_rate_(usd)')
df.columns = [
    label.strip().lower().replace(" ", "_")
    for label in df.columns
]


In [51]:
# Display the updated column names as a plain NumPy array
df.columns.values


array(['freelancer_id', 'name', 'gender', 'age', 'country', 'language',
       'primary_skill', 'years_of_experience', 'hourly_rate_(usd)',
       'rating', 'is_active', 'client_satisfaction'], dtype=object)

In [9]:
# Find all categorical (object-dtype) columns
cat_cols = df.select_dtypes(include=['object']).columns


In [10]:
# Show ALL column names (not only the categorical ones), for comparison with cat_cols
df.columns

Index(['freelancer_id', 'name', 'gender', 'age', 'country', 'language',
       'primary_skill', 'years_of_experience', 'hourly_rate_(usd)', 'rating',
       'is_active', 'client_satisfaction'],
      dtype='object')

In [11]:
# Show the object-dtype (categorical) columns selected above
cat_cols

Index(['freelancer_id', 'name', 'gender', 'country', 'language',
       'primary_skill', 'hourly_rate_(usd)', 'is_active',
       'client_satisfaction'],
      dtype='object')

In [12]:
# Null count for every categorical column; print only the columns with gaps
missing_cat = df[cat_cols].isna().sum()
has_gaps = missing_cat.gt(0)
print(missing_cat[has_gaps])

hourly_rate_(usd)       94
is_active               89
client_satisfaction    176
dtype: int64


In [13]:
# Fill the gaps in each categorical column with that column's most frequent
# value; if a column has no non-null value at all, use the literal "Unknown".
# NOTE(review): this runs BEFORE the spelling variants are normalised
# (e.g. '$50' / 'USD 50' / '50', 'Y' / 'yes' / '1'), so counts are split across
# spellings and the raw-string mode may not be the most common underlying
# value. Consider imputing after the normalisation cells instead.
cols_to_fill = missing_cat[missing_cat > 0].index
for col in cols_to_fill:
    modes = df[col].mode(dropna=True)
    fill_value = modes[0] if len(modes) > 0 else "Unknown"
    df[col] = df[col].fillna(fill_value)

In [14]:
# Re-check non-null counts after filling the categorical columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   freelancer_id        1000 non-null   object 
 1   name                 1000 non-null   object 
 2   gender               1000 non-null   object 
 3   age                  970 non-null    float64
 4   country              1000 non-null   object 
 5   language             1000 non-null   object 
 6   primary_skill        1000 non-null   object 
 7   years_of_experience  949 non-null    float64
 8   hourly_rate_(usd)    1000 non-null   object 
 9   rating               899 non-null    float64
 10  is_active            1000 non-null   object 
 11  client_satisfaction  1000 non-null   object 
dtypes: float64(3), object(9)
memory usage: 93.9+ KB


In [15]:

# Select the numeric columns
num_cols = df.select_dtypes(include="number").columns

# Count NaNs per numeric column and print only the columns that have any
missing_num = df[num_cols].isna().sum()
print(missing_num[missing_num.gt(0)])

age                     30
years_of_experience     51
rating                 101
dtype: int64


In [16]:
# Replace the NaNs in every numeric column that has gaps with that column's mean
numeric_cols_with_gaps = missing_num[missing_num > 0].index
for col in numeric_cols_with_gaps:
    column_mean = df[col].mean()
    df[col] = df[col].fillna(column_mean)

In [17]:
# After filling the numeric columns, check the non-null counts of every column again
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   freelancer_id        1000 non-null   object 
 1   name                 1000 non-null   object 
 2   gender               1000 non-null   object 
 3   age                  1000 non-null   float64
 4   country              1000 non-null   object 
 5   language             1000 non-null   object 
 6   primary_skill        1000 non-null   object 
 7   years_of_experience  1000 non-null   float64
 8   hourly_rate_(usd)    1000 non-null   object 
 9   rating               1000 non-null   float64
 10  is_active            1000 non-null   object 
 11  client_satisfaction  1000 non-null   object 
dtypes: float64(3), object(9)
memory usage: 93.9+ KB


In [18]:
# Confirm that no column has missing values left after both fills
df.isnull().sum()

Unnamed: 0,0
freelancer_id,0
name,0
gender,0
age,0
country,0
language,0
primary_skill,0
years_of_experience,0
hourly_rate_(usd),0
rating,0


# **Handle inconsistent values**

In [19]:
# List the distinct raw spellings used in the gender column
print(df['gender'].unique())

['f' 'FEMALE' 'male' 'F' 'female' 'm' 'MALE' 'Female' 'M' 'Male']


In [20]:
# Lowercase and trim whitespace so case variants collapse to one spelling
df['gender']=df['gender'].str.lower().str.strip()

In [21]:
# Check which spellings remain after lowercasing
print(df['gender'].unique())

['f' 'female' 'male' 'm']


In [22]:
# Expand the one-letter abbreviations to the full words
df['gender']=df['gender'].replace({'m':'male','f':'female'})

In [23]:
# Verify that only the full-word labels remain
print(df['gender'].unique())

['female' 'male']


In [24]:
# List the distinct raw spellings used in the is_active column
print(df['is_active'].unique())

['0' '1' 'N' 'False' 'True' 'yes' 'Y' 'no']


In [25]:
# Lowercase and trim whitespace before mapping the variants
df['is_active']=df['is_active'].str.lower().str.strip()

In [26]:
# Check which spellings remain after lowercasing
df['is_active'].unique()

array(['0', '1', 'n', 'false', 'true', 'yes', 'y', 'no'], dtype=object)

In [27]:
# Collapse every boolean-like spelling onto the two labels 'yes' / 'no'
active_aliases = {
    '0': 'no',
    '1': 'yes',
    'y': 'yes',
    'n': 'no',
    'false': 'no',
    'true': 'yes',
}
df['is_active'] = df['is_active'].replace(active_aliases)

In [28]:
# Verify that only 'yes' / 'no' remain
df['is_active'].unique()

array(['no', 'yes'], dtype=object)

# **Checking all**

In [29]:
# Re-check dtypes and non-null counts after normalising gender and is_active
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   freelancer_id        1000 non-null   object 
 1   name                 1000 non-null   object 
 2   gender               1000 non-null   object 
 3   age                  1000 non-null   float64
 4   country              1000 non-null   object 
 5   language             1000 non-null   object 
 6   primary_skill        1000 non-null   object 
 7   years_of_experience  1000 non-null   float64
 8   hourly_rate_(usd)    1000 non-null   object 
 9   rating               1000 non-null   float64
 10  is_active            1000 non-null   object 
 11  client_satisfaction  1000 non-null   object 
dtypes: float64(3), object(9)
memory usage: 93.9+ KB


In [30]:
# Ten most frequent names; a count above 1 means the name appears on several rows
df['name'].value_counts().head(10)

Unnamed: 0_level_0,count
name,Unnamed: 1_level_1
Robert Evans,2
Jason Carter,2
Amanda Mitchell,2
Mary Brown,2
Lisa Johnson,2
Amy Lee,2
Matthew Smith,2
Sarah Ho,2
Elizabeth Phillips,1
Juan Elliott,1


In [31]:
# Eyeball the name column (pandas truncates the display of long Series)
df['name']

Unnamed: 0,name
0,Ms. Nicole Kidd
1,Vanessa Garcia
2,Juan Nelson
3,Amanda Spencer
4,Lynn Curtis DDS
...,...
995,Albert Wilcox
996,Cheryl Norris
997,Kathy Watkins
998,John Obrien


In [32]:
# Schema snapshot before the dtype conversions below
# (no column has been modified since the previous df.info() call)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   freelancer_id        1000 non-null   object 
 1   name                 1000 non-null   object 
 2   gender               1000 non-null   object 
 3   age                  1000 non-null   float64
 4   country              1000 non-null   object 
 5   language             1000 non-null   object 
 6   primary_skill        1000 non-null   object 
 7   years_of_experience  1000 non-null   float64
 8   hourly_rate_(usd)    1000 non-null   object 
 9   rating               1000 non-null   float64
 10  is_active            1000 non-null   object 
 11  client_satisfaction  1000 non-null   object 
dtypes: float64(3), object(9)
memory usage: 93.9+ KB


# **Change Datatype**

In [33]:
# Current dtype of age before converting it
print(df['age'].dtype)

float64


In [34]:
# Store age as a whole number. Missing ages were filled with the column mean,
# which is generally fractional, so round to the nearest integer first:
# astype(int) on its own truncates toward zero (e.g. 40.7 -> 40).
df['age'] = df['age'].round().astype(int)


In [35]:
# Confirm that age is now an integer dtype
print(df['age'].dtype)

int64


In [36]:
# Re-check the schema after casting age to integer
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   freelancer_id        1000 non-null   object 
 1   name                 1000 non-null   object 
 2   gender               1000 non-null   object 
 3   age                  1000 non-null   int64  
 4   country              1000 non-null   object 
 5   language             1000 non-null   object 
 6   primary_skill        1000 non-null   object 
 7   years_of_experience  1000 non-null   float64
 8   hourly_rate_(usd)    1000 non-null   object 
 9   rating               1000 non-null   float64
 10  is_active            1000 non-null   object 
 11  client_satisfaction  1000 non-null   object 
dtypes: float64(2), int64(1), object(9)
memory usage: 93.9+ KB


In [37]:
# List the distinct raw values of client_satisfaction (some carry '%', some do not)
df['client_satisfaction'].unique()

array(['64%', '84%', '71%', '90%', '83%', '94%', '76%', '77%', '86%',
       '93%', '70%', '69%', '60%', '87%', '75%', '68%', '65%', '100%',
       '92', '89%', '62%', '82', '81%', '63%', '67%', '80%', '74%', '85%',
       '79%', '72%', '64', '88', '96%', '96', '81', '61%', '97%', '73%',
       '88%', '72', '92%', '82%', '93', '83', '78', '95%', '80', '87',
       '66%', '78%', '68', '91%', '97', '60', '70', '99%', '76', '86',
       '95', '74', '100', '73', '67', '77', '98%', '71', '85', '91', '94',
       '84', '90', '62', '65', '75', '63', '61', '66', '99', '79', '69',
       '89'], dtype=object)

In [38]:
# Eyeball the client_satisfaction column (pandas truncates the display)
df['client_satisfaction']

Unnamed: 0,client_satisfaction
0,64%
1,84%
2,71%
3,90%
4,83%
...,...
995,68%
996,82
997,94%
998,97


In [39]:
# Bring every value to the same 'NN%' format: drop any trailing '%' signs,
# then append exactly one
satisfaction_digits = df['client_satisfaction'].str.rstrip('%')
df['client_satisfaction'] = satisfaction_digits + '%'


In [40]:
# Verify that every value now ends with a single '%'
df['client_satisfaction'].unique()

array(['64%', '84%', '71%', '90%', '83%', '94%', '76%', '77%', '86%',
       '93%', '70%', '69%', '60%', '87%', '75%', '68%', '65%', '100%',
       '92%', '89%', '62%', '82%', '81%', '63%', '67%', '80%', '74%',
       '85%', '79%', '72%', '88%', '96%', '61%', '97%', '73%', '78%',
       '95%', '66%', '91%', '99%', '98%'], dtype=object)

In [41]:
# List the distinct raw spellings of the hourly rate (plain, '$'-prefixed, 'USD '-prefixed)
df['hourly_rate_(usd)'].unique()

array(['100', 'USD 100', '50', '$40', '30', '$30', 'USD 75', 'USD 40',
       '40', '$50', '75', 'USD 50', 'USD 30', '$20', '20', '$75', '$100',
       'USD 20'], dtype=object)

In [42]:
# Remove the 'USD' and '$' currency markers and any surrounding whitespace,
# then store the hourly rate as an integer
rate_text = df['hourly_rate_(usd)'].astype(str)
rate_text = rate_text.str.replace('USD', '', regex=False)
rate_text = rate_text.str.replace('$', '', regex=False)
df['hourly_rate_(usd)'] = rate_text.str.strip().astype(int)


In [43]:
# Verify that only integer rates remain
df['hourly_rate_(usd)'].unique()

array([100,  50,  40,  30,  75,  20])

In [44]:
# Re-check the schema after converting the hourly rate to integer
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   freelancer_id        1000 non-null   object 
 1   name                 1000 non-null   object 
 2   gender               1000 non-null   object 
 3   age                  1000 non-null   int64  
 4   country              1000 non-null   object 
 5   language             1000 non-null   object 
 6   primary_skill        1000 non-null   object 
 7   years_of_experience  1000 non-null   float64
 8   hourly_rate_(usd)    1000 non-null   int64  
 9   rating               1000 non-null   float64
 10  is_active            1000 non-null   object 
 11  client_satisfaction  1000 non-null   object 
dtypes: float64(2), int64(2), object(8)
memory usage: 93.9+ KB


In [45]:
# List the distinct rating values (the mean-imputed entries carry many decimals)
df['rating'].unique()

array([2.51256952, 3.3       , 0.        , 1.5       , 4.8       ,
       2.4       , 3.1       , 4.6       , 4.        , 3.6       ,
       2.        , 2.9       , 3.7       , 1.2       , 2.5       ,
       1.1       , 1.8       , 4.7       , 1.3       , 1.        ,
       4.2       , 1.4       , 2.2       , 3.2       , 2.7       ,
       2.8       , 4.9       , 4.5       , 1.9       , 3.4       ,
       2.3       , 3.9       , 3.5       , 4.4       , 2.6       ,
       1.6       , 3.8       , 1.7       , 5.        , 3.        ,
       4.1       , 4.3       , 2.1       ])

In [46]:
# Round ratings to one decimal place so the mean-imputed entries match the
# precision of the recorded ratings
df['rating'] = df['rating'].round(1)


In [47]:
# Verify that every rating now has at most one decimal place
df['rating'].unique()

array([2.5, 3.3, 0. , 1.5, 4.8, 2.4, 3.1, 4.6, 4. , 3.6, 2. , 2.9, 3.7,
       1.2, 1.1, 1.8, 4.7, 1.3, 1. , 4.2, 1.4, 2.2, 3.2, 2.7, 2.8, 4.9,
       4.5, 1.9, 3.4, 2.3, 3.9, 3.5, 4.4, 2.6, 1.6, 3.8, 1.7, 5. , 3. ,
       4.1, 4.3, 2.1])

In [48]:
# Display-only tweak: make NumPy print floats with exactly one decimal place.
# This changes NumPy's print options for the rest of the session; it does not
# modify the values stored in df['rating'].
np.set_printoptions(formatter={'float': '{:0.1f}'.format})

df['rating'].unique()


array([2.5, 3.3, 0.0, 1.5, 4.8, 2.4, 3.1, 4.6, 4.0, 3.6, 2.0, 2.9, 3.7,
       1.2, 1.1, 1.8, 4.7, 1.3, 1.0, 4.2, 1.4, 2.2, 3.2, 2.7, 2.8, 4.9,
       4.5, 1.9, 3.4, 2.3, 3.9, 3.5, 4.4, 2.6, 1.6, 3.8, 1.7, 5.0, 3.0,
       4.1, 4.3, 2.1])

In [49]:
# Final schema check before exporting
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   freelancer_id        1000 non-null   object 
 1   name                 1000 non-null   object 
 2   gender               1000 non-null   object 
 3   age                  1000 non-null   int64  
 4   country              1000 non-null   object 
 5   language             1000 non-null   object 
 6   primary_skill        1000 non-null   object 
 7   years_of_experience  1000 non-null   float64
 8   hourly_rate_(usd)    1000 non-null   int64  
 9   rating               1000 non-null   float64
 10  is_active            1000 non-null   object 
 11  client_satisfaction  1000 non-null   object 
dtypes: float64(2), int64(2), object(8)
memory usage: 93.9+ KB


In [50]:
# Export Final DataFrame to CSV
# The relative path writes into the kernel's current working directory;
# index=False keeps the row index out of the file.
df.to_csv('global_freelancers_clean_data.csv', index=False)