In [2]:
# API Requests
import pprint
import requests
import json
import time

# Data Science
import pandas as pd
import numpy as np
import scipy.stats as sc
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Geocoding
from api_keys import opencage_key
from api_keys import geoapify_key


In [3]:
# Load the raw card records; the path is relative to this notebook's folder.
filepath_cards = "../Resources/cards_data.csv"
df_cards = pd.read_csv(filepath_cards)

# Load the raw user records from the same Resources folder.
filepath_users = "../Resources/users_data.csv"
df_users = pd.read_csv(filepath_users)

In [6]:
# Print column names, non-null counts and dtypes for both raw tables
# (cards first, then users).
for raw_frame in (df_cards, df_users):
    raw_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6146 entries, 0 to 6145
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     6146 non-null   int64 
 1   client_id              6146 non-null   int64 
 2   card_brand             6146 non-null   object
 3   card_type              6146 non-null   object
 4   card_number            6146 non-null   int64 
 5   expires                6146 non-null   object
 6   cvv                    6146 non-null   int64 
 7   has_chip               6146 non-null   object
 8   num_cards_issued       6146 non-null   int64 
 9   credit_limit           6146 non-null   object
 10  acct_open_date         6146 non-null   object
 11  year_pin_last_changed  6146 non-null   int64 
 12  card_on_dark_web       6146 non-null   object
dtypes: int64(6), object(7)
memory usage: 624.3+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Da

In [8]:
# Merge both files: every card row is joined to its owner by matching
# cards.client_id against users.id. Both frames carry an `id` column, so
# pandas suffixes them as id_x (card id) and id_y (user id).
# validate="many_to_one" makes the merge fail loudly if a user id is ever
# duplicated, instead of silently multiplying card rows.
df = pd.merge(
    df_cards,
    df_users,
    left_on="client_id",
    right_on="id",
    how="inner",
    validate="many_to_one",
)
df.head()

Unnamed: 0,id_x,client_id,card_brand,card_type,card_number,expires,cvv,has_chip,num_cards_issued,credit_limit,...,birth_month,gender,address,latitude,longitude,per_capita_income,yearly_income,total_debt,credit_score,num_credit_cards
0,4524,825,Visa,Debit,4344676511950444,12/2022,623,YES,2,$24295,...,11,Female,462 Rose Lane,34.15,-117.76,$29278,$59696,$127613,787,5
1,2731,825,Visa,Debit,4956965974959986,12/2020,393,YES,2,$21968,...,11,Female,462 Rose Lane,34.15,-117.76,$29278,$59696,$127613,787,5
2,3701,825,Visa,Debit,4582313478255491,02/2024,719,YES,2,$46414,...,11,Female,462 Rose Lane,34.15,-117.76,$29278,$59696,$127613,787,5
3,42,825,Visa,Credit,4879494103069057,08/2024,693,NO,1,$12400,...,11,Female,462 Rose Lane,34.15,-117.76,$29278,$59696,$127613,787,5
4,4659,825,Mastercard,Debit (Prepaid),5722874738736011,03/2009,75,YES,1,$28,...,11,Female,462 Rose Lane,34.15,-117.76,$29278,$59696,$127613,787,5


In [10]:
# Remove the dollar sign from the monetary columns so they can be cast to
# numbers later. str.strip("$") removes the character from both ends of
# each value; the columns remain strings after this cell.
money_columns = ["credit_limit", "per_capita_income", "yearly_income", "total_debt"]
for money_column in money_columns:
    df[money_column] = df[money_column].str.strip("$")
df.head()

Unnamed: 0,id_x,client_id,card_brand,card_type,card_number,expires,cvv,has_chip,num_cards_issued,credit_limit,...,birth_month,gender,address,latitude,longitude,per_capita_income,yearly_income,total_debt,credit_score,num_credit_cards
0,4524,825,Visa,Debit,4344676511950444,12/2022,623,YES,2,24295,...,11,Female,462 Rose Lane,34.15,-117.76,29278,59696,127613,787,5
1,2731,825,Visa,Debit,4956965974959986,12/2020,393,YES,2,21968,...,11,Female,462 Rose Lane,34.15,-117.76,29278,59696,127613,787,5
2,3701,825,Visa,Debit,4582313478255491,02/2024,719,YES,2,46414,...,11,Female,462 Rose Lane,34.15,-117.76,29278,59696,127613,787,5
3,42,825,Visa,Credit,4879494103069057,08/2024,693,NO,1,12400,...,11,Female,462 Rose Lane,34.15,-117.76,29278,59696,127613,787,5
4,4659,825,Mastercard,Debit (Prepaid),5722874738736011,03/2009,75,YES,1,28,...,11,Female,462 Rose Lane,34.15,-117.76,29278,59696,127613,787,5


In [12]:
# Cast the cleaned monetary columns from strings to integers in one pass,
# then print the schema to confirm the new dtypes.
integer_dtypes = {
    "credit_limit": int,
    "per_capita_income": int,
    "yearly_income": int,
    "total_debt": int,
}
df = df.astype(integer_dtypes)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6146 entries, 0 to 6145
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id_x                   6146 non-null   int64  
 1   client_id              6146 non-null   int64  
 2   card_brand             6146 non-null   object 
 3   card_type              6146 non-null   object 
 4   card_number            6146 non-null   int64  
 5   expires                6146 non-null   object 
 6   cvv                    6146 non-null   int64  
 7   has_chip               6146 non-null   object 
 8   num_cards_issued       6146 non-null   int64  
 9   credit_limit           6146 non-null   int32  
 10  acct_open_date         6146 non-null   object 
 11  year_pin_last_changed  6146 non-null   int64  
 12  card_on_dark_web       6146 non-null   object 
 13  id_y                   6146 non-null   int64  
 14  current_age            6146 non-null   int64  
 15  reti

In [14]:
# Collapse the card-level table to one row per client: for each client_id
# only the first row encountered is kept.
df2 = df.drop_duplicates(subset=['client_id'], keep='first')

# Inspect the de-duplicated frame.
# NOTE: only a cell's last expression is rendered, so this head() call is
# evaluated but not displayed; info() prints the schema itself.
df2.head()
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, 0 to 6144
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id_x                   2000 non-null   int64  
 1   client_id              2000 non-null   int64  
 2   card_brand             2000 non-null   object 
 3   card_type              2000 non-null   object 
 4   card_number            2000 non-null   int64  
 5   expires                2000 non-null   object 
 6   cvv                    2000 non-null   int64  
 7   has_chip               2000 non-null   object 
 8   num_cards_issued       2000 non-null   int64  
 9   credit_limit           2000 non-null   int32  
 10  acct_open_date         2000 non-null   object 
 11  year_pin_last_changed  2000 non-null   int64  
 12  card_on_dark_web       2000 non-null   object 
 13  id_y                   2000 non-null   int64  
 14  current_age            2000 non-null   int64  
 15  retiremen

In [16]:
# Columns that are not needed for the per-client analysis: the card-level
# fields, both merge id columns, and retirement_age.
columns_to_drop = [
    'id_x',
    'card_brand',
    'card_type',
    'card_number',
    'expires',
    'cvv',
    'has_chip',
    'num_cards_issued',
    'credit_limit',
    'acct_open_date',
    'year_pin_last_changed',
    'card_on_dark_web',
    'id_y',
    'retirement_age',
]
df2 = df2.drop(columns=columns_to_drop)

# Preview the remaining columns.
df2.head()

Unnamed: 0,client_id,current_age,birth_year,birth_month,gender,address,latitude,longitude,per_capita_income,yearly_income,total_debt,credit_score,num_credit_cards
0,825,53,1966,11,Female,462 Rose Lane,34.15,-117.76,29278,59696,127613,787,5
5,1746,53,1966,12,Female,3606 Federal Boulevard,40.76,-73.74,37891,77254,191349,701,5
10,1718,81,1938,11,Female,766 Third Drive,34.02,-117.89,22681,33483,196,698,5
15,708,63,1957,1,Female,3 Madison Street,40.71,-73.99,163145,249925,202328,722,4
19,1164,43,1976,9,Male,9620 Valley Stream Drive,37.76,-122.44,53797,109687,183855,675,1


In [18]:
# Debt-to-income ratio: each client's total debt divided by their yearly
# income, stored as a new column on df2.
dti_ratio = df2['total_debt'].div(df2['yearly_income'])
df2['debt_to_income'] = dti_ratio
df2

Unnamed: 0,client_id,current_age,birth_year,birth_month,gender,address,latitude,longitude,per_capita_income,yearly_income,total_debt,credit_score,num_credit_cards,debt_to_income
0,825,53,1966,11,Female,462 Rose Lane,34.15,-117.76,29278,59696,127613,787,5,2.137714
5,1746,53,1966,12,Female,3606 Federal Boulevard,40.76,-73.74,37891,77254,191349,701,5,2.476881
10,1718,81,1938,11,Female,766 Third Drive,34.02,-117.89,22681,33483,196,698,5,0.005854
15,708,63,1957,1,Female,3 Madison Street,40.71,-73.99,163145,249925,202328,722,4,0.809555
19,1164,43,1976,9,Male,9620 Valley Stream Drive,37.76,-122.44,53797,109687,183855,675,1,1.676179
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6133,986,32,1987,7,Male,6577 Lexington Lane,40.65,-73.58,23550,48010,87837,703,3,1.829556
6136,1944,62,1957,11,Female,2 Elm Drive,38.95,-84.54,24218,49378,104480,740,4,2.115922
6140,185,47,1973,1,Female,276 Fifth Boulevard,40.66,-74.19,15175,30942,71066,779,3,2.296749
6143,1007,66,1954,2,Male,259 Valley Boulevard,40.24,-76.92,25336,54654,27241,618,1,0.498426


In [None]:
def reverse_geocode(lat, lng, api_key, timeout=10):
    """Look up a human-readable address for one coordinate pair via OpenCage.

    Parameters
    ----------
    lat, lng : float
        Latitude and longitude, sent to the API as the query "lat,lng".
    api_key : str
        OpenCage API key.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up, so a single
        stalled connection cannot hang a row-wise apply indefinitely.

    Returns
    -------
    str
        The `formatted` address of the first result on success;
        "No results found" when the API returns no results;
        "Error: <status code>" for a non-200 response;
        "Error: <exception class name>" when the request itself fails
        or the response body is not valid JSON.
    """
    base_url = "https://api.opencagedata.com/geocode/v1/json"
    params = {
        "q": f"{lat},{lng}",
        "key": api_key,  # Using the API key passed from the imported file
        "language": "en",
        # Only the formatted address is used, so skip the annotations payload.
        "no_annotations": 1,
    }

    try:
        response = requests.get(base_url, params=params, timeout=timeout)
        if response.status_code != 200:
            return f"Error: {response.status_code}"
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Keep the same "Error: ..." string convention as HTTP failures so
        # one bad request does not abort the whole batch.
        return f"Error: {type(exc).__name__}"

    # .get() tolerates a payload without a "results" key.
    results = data.get("results")
    if not results:
        return "No results found"

    # Returning the formatted address
    return results[0]["formatted"]

# Reverse-geocode every client's coordinates and overwrite `address` with the
# result. Each distinct (latitude, longitude) pair is looked up only once, and
# lookups are spaced out so a rate-limited API tier does not start rejecting
# requests (rejections would otherwise end up in the column as "Error: ..."
# strings). Set GEOCODE_PAUSE_SECONDS to 0 if your plan has no such limit.
GEOCODE_PAUSE_SECONDS = 1.0
geocode_cache = {}


def _address_for_row(row):
    """Return the cached or freshly fetched address for one row's coordinates."""
    coords = (row['latitude'], row['longitude'])
    if coords not in geocode_cache:
        geocode_cache[coords] = reverse_geocode(coords[0], coords[1], opencage_key)
        time.sleep(GEOCODE_PAUSE_SECONDS)
    return geocode_cache[coords]


df2['address'] = df2.apply(_address_for_row, axis=1)

# Check the result
df2.head()

In [None]:
# Display df2 to review it after the geocoding cell above has rewritten `address`.
df2

In [None]:
# Save the full DataFrame to a new CSV file
df2.to_csv('full_dataframe.csv', index=False