In [120]:
import pandas as pd
import matplotlib.pyplot as plt
# Specify the file path
file_path = "credit-card-fraud-data.csv"

# Read the CSV file
cleaned_data = pd.read_csv(file_path)

# Print the head of the DataFrame
cleaned_data.head()

Unnamed: 0,Number ID,Province/State,Fraud and Cybercrime Thematic Categories,Solicitation Method,Gender,Victim Age Range,Dollar Loss
0,1,Saskatchewan,Merchandise,Other/unknown,Male,60-69,222.73
1,2,British Columbia,Vendor Fraud,Text message,Male,10-19,300.0
2,3,Yukon,Spear Phishing,Email,Male,70-79,1600.0
3,4,Alberta,Extortion,Internet-social network,Male,40-49,24000.0
4,5,Prince Edward Island,Merchandise,Internet,Male,30-39,11000.0


In [121]:
# 2. Province/State Distribution
num_transactions = len(cleaned_data)
print("Number of unique transactions:", num_transactions)

Number of unique transactions: 1000


In [122]:
# 2. Number of unique provinces/states
num_provinces_states = cleaned_data['Province/State'].nunique()
print("Number of unique provinces/states:", num_provinces_states)

Number of unique provinces/states: 20


In [123]:
# 3. Number of unique fraud and cybercrime thematic categories
num_fraud_categories = cleaned_data['Fraud and Cybercrime Thematic Categories'].nunique()
print("Number of unique fraud and cybercrime thematic categories:", num_fraud_categories)

Number of unique fraud and cybercrime thematic categories: 23


In [124]:
# 4. Number of unique solicitation methods
num_solicitation_methods = cleaned_data['Solicitation Method'].nunique()
print("Number of unique solicitation methods:", num_solicitation_methods)

Number of unique solicitation methods: 10


In [125]:
# 5. Gender distribution
gender_distribution = cleaned_data['Gender'].value_counts()
print("Gender distribution:")
print(gender_distribution)

Gender distribution:
Gender
Female    519
Male      481
Name: count, dtype: int64


In [126]:
# 7. Total dollar loss
total_dollar_loss = cleaned_data['Dollar Loss'].sum()
print("Total dollar loss:", total_dollar_loss)
# 8. Average dollar loss per transaction
average_dollar_loss_per_transaction = total_dollar_loss / num_transactions
print("Average dollar loss per transaction:", average_dollar_loss_per_transaction)

Total dollar loss: 10526722.71
Average dollar loss per transaction: 10526.72271


In [127]:
# 9. Maximum dollar loss
max_dollar_loss = cleaned_data['Dollar Loss'].max()
print("Maximum dollar loss:", max_dollar_loss)

Maximum dollar loss: 1082651.1


In [128]:
# 10. Minimum dollar loss
min_dollar_loss = cleaned_data['Dollar Loss'].min()
print("Minimum dollar loss:", min_dollar_loss)

Minimum dollar loss: 0.01


In [129]:
# Group data by solicitation method
grouped_data = cleaned_data.groupby('Victim Age Range')

# Summary statistics of victim age for each group
age_summary_stats = grouped_data['Solicitation Method'].describe()


# Display summary statistics
print("Summary statistics of victim age by solicitation method:")
print(age_summary_stats)

Summary statistics of victim age by solicitation method:
                 count unique                      top freq
Victim Age Range                                           
10-19               60      6                 Internet   18
100 +                1      1                    Email    1
20-29              243      7  Internet-social network   69
30-39              207      7                 Internet   71
40-49              157      7                 Internet   63
50-59              121      9                 Internet   39
60-69              117      7              Direct call   32
70-79               72      7                 Internet   24
80-89               19      7              Direct call    8
90-99                3      1              Direct call    3


In [130]:
# Group data by province/state
grouped_data = cleaned_data.groupby('Province/State')

# Count occurrences of each province/state
state_counts = grouped_data.size().reset_index(name='Count')

# Summary statistics of solicitation method for each group
solicitation_summary_stats = grouped_data['Solicitation Method'].describe()

# Reset index to make 'Province/State' a regular column
solicitation_summary_stats = solicitation_summary_stats.reset_index()

# Merge count of occurrences with summary statistics DataFrame
merged_data = pd.merge(state_counts, solicitation_summary_stats, on='Province/State')

# Sort the merged data by count in descending order
# sorted_merged_data = merged_data.sort_values(by='Count', ascending=False)

# Display sorted data
print("Data sorted by state count in descending order:")
print(merged_data)

Data sorted by state count in descending order:
               Province/State  Count count unique                      top  \
0                     Alberta     99    99      7                 Internet   
1            British Columbia    112   112      7  Internet-social network   
2                  California      2     2      1  Internet-social network   
3                    Colorado      1     1      1   Door to door/in person   
4                 Connecticut      1     1      1                    Radio   
5                    Manitoba     42    42      5  Internet-social network   
6                      Nevada      2     2      2                 Internet   
7               New Brunswick     14    14      4              Direct call   
8               New Hampshire      1     1      1                 Internet   
9                  New Jersey      2     2      1                 Internet   
10  Newfoundland And Labrador      9     9      5                 Internet   
11     North Wes

In [131]:
frequency = cleaned_data["Solicitation Method"].value_counts(normalize=True) * 100

print(frequency)

Solicitation Method
Internet                   31.4
Internet-social network    26.6
Direct call                15.8
Email                      12.8
Text message                7.9
Other/unknown               3.4
Door to door/in person      1.6
Mail                        0.2
Print                       0.2
Radio                       0.1
Name: proportion, dtype: float64


In [132]:
frequency2 = cleaned_data["Province/State"].value_counts(normalize=True) * 100

print(frequency2)

Province/State
Ontario                      43.9
Quebec                       21.1
British Columbia             11.2
Alberta                       9.9
Manitoba                      4.2
Nova Scotia                   3.3
Saskatchewan                  2.5
New Brunswick                 1.4
Newfoundland And Labrador     0.9
Prince Edward Island          0.2
Yukon                         0.2
California                    0.2
New Jersey                    0.2
Nevada                        0.2
Wisconsin                     0.1
North West Territories        0.1
Texas                         0.1
Connecticut                   0.1
Colorado                      0.1
New Hampshire                 0.1
Name: proportion, dtype: float64


In [133]:
frequency3 = cleaned_data["Fraud and Cybercrime Thematic Categories"].value_counts(normalize=True) * 100

print(frequency3)

Fraud and Cybercrime Thematic Categories
Merchandise                                   31.5
Vendor Fraud                                  13.4
Counterfeit Merchandise                        8.9
Extortion                                      7.8
Service                                        6.9
Investments                                    6.5
Romance                                        5.4
Job                                            4.8
Spear Phishing                                 3.6
Bank Investigator                              2.9
Emergency (Jail, Accident, Hospital, Help)     2.6
Loan                                           1.6
Prize                                          1.3
Unauthorized Charge                            0.6
Other                                          0.5
Timeshare                                      0.3
GRANT                                          0.3
False Billing                                  0.3
Vacation                                 

In [134]:
frequency4 = cleaned_data["Victim Age Range"].value_counts(normalize=True) * 100

print(frequency4)

Victim Age Range
20-29    24.3
30-39    20.7
40-49    15.7
50-59    12.1
60-69    11.7
70-79     7.2
10-19     6.0
80-89     1.9
90-99     0.3
100 +     0.1
Name: proportion, dtype: float64


In [135]:
frequency5 = cleaned_data["Gender"].value_counts(normalize=True) * 100

print(frequency5)

Gender
Female    51.9
Male      48.1
Name: proportion, dtype: float64
