In [37]:
from pathlib import Path

import pandas as pd

# Location of the raw fraud CSV, relative to the notebook.
# Forward slashes work on every OS; a backslash separator here would form the
# invalid escape sequence "\c" and would only resolve as a path on Windows.
file_path = "Resources/credit-card-fraud-data-original.csv"

# Load the raw CSV into a DataFrame
data = pd.read_csv(file_path)

# Preview the first rows of the raw data
data.head()

Unnamed: 0,_id,Numero d'identification / Number ID,Date Received / Date recue,Complaint Received Type,Type de plainte recue,Country,Pays,Province/State,Province/Etat,Fraud and Cybercrime Thematic Categories,...,Methode de sollicitation,Gender,Genre,Language of Correspondence,Langue de correspondance,Victim Age Range / Tranche d'age des victimes,Complaint Type,Type de plainte,Number of Victims / Nombre de victimes,Dollar Loss /pertes financieres
0,262712,1,2021-01-02,CAFC Website,CAFC site web,Canada,Canada,Saskatchewan,Saskatchewan,Merchandise,...,Autre/inconnu,Not Available,non disponible,Not Available,non disponible,'Not Available / non disponible,Attempt,Tentative,0,$0.00
1,262713,2,2021-01-02,CAFC Website,CAFC site web,Not Specified,Non spécifié,Not Specified,Non spécifié,Merchandise,...,Internet,Not Available,non disponible,Not Available,non disponible,'Not Available / non disponible,Victim,Victime,1,"$1,000.00"
2,262714,3,2021-01-02,CAFC Website,CAFC site web,Canada,Canada,Quebec,Québec,Identity Fraud,...,Autre/inconnu,Male,Homme,French,Français,'40 - 49,Victim,Victime,1,$0.00
3,262715,4,2021-01-02,CAFC Website,CAFC site web,Canada,Canada,Saskatchewan,Saskatchewan,Phishing,...,Courriel,Male,Homme,English,Anglais,'30 - 39,Victim,Victime,1,$0.00
4,262716,5,2021-01-02,CAFC Website,CAFC site web,Canada,Canada,Saskatchewan,Saskatchewan,Merchandise,...,Autre/inconnu,Male,Homme,Not Available,non disponible,'60 - 69,Victim,Victime,1,$222.73


In [38]:
# Labels of the columns to discard from the raw frame
columns_to_remove = [
    "_id",
    "Date Received / Date recue",
    "Complaint Received Type",
    "Type de plainte recue",
    "Pays",
    "Province/Etat",
    "Categories thematiques sur la fraude et la cybercriminalite",
    "Methode de sollicitation",
    "Genre",
    "Language of Correspondence",
    "Langue de correspondance",
    "Complaint Type",
    "Type de plainte",
    "Number of Victims / Nombre de victimes",
    "Country",
]

# Drop those labels along the column axis
data = data.drop(labels=columns_to_remove, axis=1)

# Preview the remaining columns
data.head()

Unnamed: 0,Numero d'identification / Number ID,Province/State,Fraud and Cybercrime Thematic Categories,Solicitation Method,Gender,Victim Age Range / Tranche d'age des victimes,Dollar Loss /pertes financieres
0,1,Saskatchewan,Merchandise,Other/unknown,Not Available,'Not Available / non disponible,$0.00
1,2,Not Specified,Merchandise,Internet,Not Available,'Not Available / non disponible,"$1,000.00"
2,3,Quebec,Identity Fraud,Other/unknown,Male,'40 - 49,$0.00
3,4,Saskatchewan,Phishing,Email,Male,'30 - 39,$0.00
4,5,Saskatchewan,Merchandise,Other/unknown,Male,'60 - 69,$222.73


In [39]:
# Keep only the rows whose "Dollar Loss /pertes financieres" text is anything
# other than the literal string "$0.00"
data = data.loc[data["Dollar Loss /pertes financieres"].ne("$0.00")]

# Preview the filtered rows
data.head()

Unnamed: 0,Numero d'identification / Number ID,Province/State,Fraud and Cybercrime Thematic Categories,Solicitation Method,Gender,Victim Age Range / Tranche d'age des victimes,Dollar Loss /pertes financieres
1,2,Not Specified,Merchandise,Internet,Not Available,'Not Available / non disponible,"$1,000.00"
4,5,Saskatchewan,Merchandise,Other/unknown,Male,'60 - 69,$222.73
7,8,British Columbia,Vendor Fraud,Text message,Male,'10 - 19,$300.00
12,13,Not Specified,Merchandise,Internet,Not Available,'Not Available / non disponible,$269.17
14,15,Yukon,Spear Phishing,Email,Male,'70 - 79,"$1,600.00"


In [40]:
# Shorten the three bilingual column headers to English-only names
data = data.rename(
    {
        "Numero d'identification / Number ID": "Number ID",
        "Victim Age Range / Tranche d'age des victimes": "Victim Age Range",
        "Dollar Loss /pertes financieres": "Dollar Loss",
    },
    axis="columns",
)

# Preview the frame with the new headers
data.head()

Unnamed: 0,Number ID,Province/State,Fraud and Cybercrime Thematic Categories,Solicitation Method,Gender,Victim Age Range,Dollar Loss
1,2,Not Specified,Merchandise,Internet,Not Available,'Not Available / non disponible,"$1,000.00"
4,5,Saskatchewan,Merchandise,Other/unknown,Male,'60 - 69,$222.73
7,8,British Columbia,Vendor Fraud,Text message,Male,'10 - 19,$300.00
12,13,Not Specified,Merchandise,Internet,Not Available,'Not Available / non disponible,$269.17
14,15,Yukon,Spear Phishing,Email,Male,'70 - 79,"$1,600.00"


In [41]:
# Turn the placeholder strings into missing values (whole-cell matches only),
# then discard every row that has at least one missing value
data = (
    data
    .replace(["Not Specified", "Not Available", "Unknown", "'Not Available / non disponible"], pd.NA)
    .dropna()
)

data.head()

Unnamed: 0,Number ID,Province/State,Fraud and Cybercrime Thematic Categories,Solicitation Method,Gender,Victim Age Range,Dollar Loss
4,5,Saskatchewan,Merchandise,Other/unknown,Male,'60 - 69,$222.73
7,8,British Columbia,Vendor Fraud,Text message,Male,'10 - 19,$300.00
14,15,Yukon,Spear Phishing,Email,Male,'70 - 79,"$1,600.00"
35,36,Alberta,Extortion,Internet-social network,Male,'40 - 49,"$24,000.00"
39,40,Prince Edward Island,Merchandise,Internet,Male,'30 - 39,"$11,000.00"


In [42]:
# Delete every apostrophe character from the "Victim Age Range" values
data = data.assign(
    **{"Victim Age Range": data["Victim Age Range"].str.replace("'", "")}
)

data.head()

Unnamed: 0,Number ID,Province/State,Fraud and Cybercrime Thematic Categories,Solicitation Method,Gender,Victim Age Range,Dollar Loss
4,5,Saskatchewan,Merchandise,Other/unknown,Male,60 - 69,$222.73
7,8,British Columbia,Vendor Fraud,Text message,Male,10 - 19,$300.00
14,15,Yukon,Spear Phishing,Email,Male,70 - 79,"$1,600.00"
35,36,Alberta,Extortion,Internet-social network,Male,40 - 49,"$24,000.00"
39,40,Prince Edward Island,Merchandise,Internet,Male,30 - 39,"$11,000.00"


In [43]:
# Collapse the " - " separator in "Victim Age Range" to a bare hyphen
# (e.g. "60 - 69" becomes "60-69")
data["Victim Age Range"] = (
    data["Victim Age Range"]
    .str.replace(" - ", "-")
)

data.head()

Unnamed: 0,Number ID,Province/State,Fraud and Cybercrime Thematic Categories,Solicitation Method,Gender,Victim Age Range,Dollar Loss
4,5,Saskatchewan,Merchandise,Other/unknown,Male,60-69,$222.73
7,8,British Columbia,Vendor Fraud,Text message,Male,10-19,$300.00
14,15,Yukon,Spear Phishing,Email,Male,70-79,"$1,600.00"
35,36,Alberta,Extortion,Internet-social network,Male,40-49,"$24,000.00"
39,40,Prince Edward Island,Merchandise,Internet,Male,30-39,"$11,000.00"


In [44]:
# Overwrite "Number ID" with consecutive integers starting at 1, one per
# remaining row
data["Number ID"] = range(1, data.shape[0] + 1)

# Preview the renumbered frame
data.head()

Unnamed: 0,Number ID,Province/State,Fraud and Cybercrime Thematic Categories,Solicitation Method,Gender,Victim Age Range,Dollar Loss
4,1,Saskatchewan,Merchandise,Other/unknown,Male,60-69,$222.73
7,2,British Columbia,Vendor Fraud,Text message,Male,10-19,$300.00
14,3,Yukon,Spear Phishing,Email,Male,70-79,"$1,600.00"
35,4,Alberta,Extortion,Internet-social network,Male,40-49,"$24,000.00"
39,5,Prince Edward Island,Merchandise,Internet,Male,30-39,"$11,000.00"


In [45]:
# Replace the row labels with a fresh default index; drop=True discards the
# old labels instead of keeping them as a new column
data = data.reset_index(drop=True)

# Preview the DataFrame with the new index
data.head()

Unnamed: 0,Number ID,Province/State,Fraud and Cybercrime Thematic Categories,Solicitation Method,Gender,Victim Age Range,Dollar Loss
0,1,Saskatchewan,Merchandise,Other/unknown,Male,60-69,$222.73
1,2,British Columbia,Vendor Fraud,Text message,Male,10-19,$300.00
2,3,Yukon,Spear Phishing,Email,Male,70-79,"$1,600.00"
3,4,Alberta,Extortion,Internet-social network,Male,40-49,"$24,000.00"
4,5,Prince Edward Island,Merchandise,Internet,Male,30-39,"$11,000.00"


In [46]:
# Keep only the first 1000 rows (positions 0-999). The explicit .copy() makes
# the subset an independent DataFrame, so the column assignments in later
# cells never act on a slice of the previous frame (the situation pandas
# reports as SettingWithCopyWarning).
data = data.iloc[:1000].copy()

In [47]:
# Display the dtype of every column in the DataFrame
data.dtypes

Number ID                                    int64
Province/State                              object
Fraud and Cybercrime Thematic Categories    object
Solicitation Method                         object
Gender                                      object
Victim Age Range                            object
Dollar Loss                                 object
dtype: object

In [48]:
# Strip the currency formatting from "Dollar Loss": the "$" sign and the
# thousands-separator commas. regex=False forces literal matching, which
# matters because "$" is a regex metacharacter (end-of-string anchor) and the
# default value of `regex` differs between pandas versions.
data["Dollar Loss"] = (
    data["Dollar Loss"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
)

In [49]:
# Cast the "Dollar Loss" column to floating-point numbers
data["Dollar Loss"] = data["Dollar Loss"].astype("float64")

In [50]:
# Display the first five rows of the DataFrame
data.head(5)

Unnamed: 0,Number ID,Province/State,Fraud and Cybercrime Thematic Categories,Solicitation Method,Gender,Victim Age Range,Dollar Loss
0,1,Saskatchewan,Merchandise,Other/unknown,Male,60-69,222.73
1,2,British Columbia,Vendor Fraud,Text message,Male,10-19,300.0
2,3,Yukon,Spear Phishing,Email,Male,70-79,1600.0
3,4,Alberta,Extortion,Internet-social network,Male,40-49,24000.0
4,5,Prince Edward Island,Merchandise,Internet,Male,30-39,11000.0


In [51]:
# Display the dtype of every column in the DataFrame
data.dtypes

Number ID                                     int64
Province/State                               object
Fraud and Cybercrime Thematic Categories     object
Solicitation Method                          object
Gender                                       object
Victim Age Range                             object
Dollar Loss                                 float64
dtype: object

In [53]:
# Export the DataFrame to a CSV file without writing the row index.
# The parent folder is created first because to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
output_path = Path("Output/credit-card-fraud-data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(output_path, index=False)