# Exploration des données

## Importation des librairies et du dataset

In [1]:
import numpy as np
import pandas as pd
import sweetviz as sv

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the cleaned SBAnational CSV; the path is relative to the notebook's
# working directory.
DATASET_CSV = "datasets/SBAnational-cleaned.csv"
data = pd.read_csv(filepath_or_buffer=DATASET_CSV)

## Exploration avec SweetViz

In [3]:
# Encode the target column as 0/1.
# NOTE(review): presumably "CHGOFF" = charged off and "P I F" = paid in full — confirm.
mapping = {"CHGOFF": 0, "P I F": 1}

# Series.map turns every value that is not a key of `mapping` into NaN, so
# re-running this cell on the already-encoded column would wipe the target.
# Only apply the mapping while the column still holds the raw string labels.
if not pd.api.types.is_numeric_dtype(data["MIS_Status"]):
    data["MIS_Status"] = data["MIS_Status"].map(mapping)

# Force ApprovalFY to a numeric dtype; values that cannot be parsed become NaN.
data["ApprovalFY"] = pd.to_numeric(data["ApprovalFY"], errors="coerce")

# Build the SweetViz report with MIS_Status as the target feature and write it
# out as an HTML page.
report = sv.analyze(data, "MIS_Status")
report.show_html("sv-reports/report.html", layout="vertical")

Feature: Name                                |▉         | [ 10%]   00:03 -> (00:22 left)

KeyboardInterrupt: 

## Autres explorations

In [14]:
# Rows whose Name column is exactly "SUBWAY".
subway_subset = data[data["Name"] == "SUBWAY"]

display(subway_subset)

# Only the last expression of a cell is rendered automatically, so the
# MIS_Status counts have to be displayed explicitly; as a bare expression in
# the middle of the cell their result was silently discarded.
display(subway_subset["MIS_Status"].value_counts())
subway_subset["FranchiseCode"].value_counts()


Unnamed: 0,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,MIS_Status,GrAppv
24,SUBWAY,LITTLE ROCK,AR,72223,HOPE FCU,MS,722211,7-Feb-06,2006,126,7,1.0,0,0,1,1,N,N,1,137300
3145,SUBWAY,GLEN MILLS,PA,19342,"PNC BANK, NATIONAL ASSOCIATION",DE,722211,10-Feb-06,2006,72,3,2.0,1,3,1,1,0,N,1,50000
7444,SUBWAY,MANCHESTER,ME,4351,BANGOR SAVINGS BANK,ME,722211,16-Feb-06,2006,84,2,1.0,1,2,78760,1,0,N,1,102000
8269,SUBWAY,NORWICH,CT,6360,BANK OF AMERICA NATL ASSOC,RI,722211,8-Sep-04,2004,36,10,2.0,0,0,78760,1,Y,N,1,47000
10176,SUBWAY,ATHENS,TX,75751,COMMUNITY NATL BK & TR OF TEXA,TX,0,1-Apr-97,1997,84,23,1.0,0,0,1,0,N,Y,1,77600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894076,SUBWAY,NASHVILLE,TN,37219,FIRST TENNESSEE BANK NATL ASSO,TN,0,29-Jan-97,1997,60,5,2.0,0,0,78760,0,N,Y,1,72000
896208,SUBWAY,MEMPHIS,TN,38118,TRI-STATE BANK OF MEMPHIS,TN,0,20-Feb-97,1997,84,5,2.0,0,0,78760,0,N,Y,1,100000
896285,SUBWAY,EDMONDS,WA,98026,WELLS FARGO BANK NATL ASSOC,SD,0,20-Feb-97,1997,84,5,1.0,0,0,78760,0,0,N,1,200000
896632,SUBWAY,DURANT,OK,74701,LANDMARK BANK NATL ASSOC,OK,0,24-Feb-97,1997,84,20,1.0,0,0,78760,0,N,Y,1,100000


FranchiseCode
78760    1114
1         144
78759       5
0           3
21425       1
78755       1
78640       1
Name: count, dtype: int64

In [13]:
# Rows whose Name column is exactly "DOMINO'S PIZZA".
is_dominos = data["Name"].eq("DOMINO'S PIZZA")
pizza_subset = data.loc[is_dominos]

display(pizza_subset)

# Frequency of each FranchiseCode among those rows (rendered as the cell output).
pizza_subset.loc[:, "FranchiseCode"].value_counts()

Unnamed: 0,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,MIS_Status,GrAppv
2490,DOMINO'S PIZZA,SARASOTA,FL,34231,STEARNS BK NATL ASSOC,FL,0,6-Mar-97,1997,60,50,1.0,0,0,24850,0,N,N,1,260000
10530,DOMINO'S PIZZA,SPRINGFIELD,MO,65802,EMPIRE BANK,MO,0,28-Mar-97,1997,84,109,1.0,0,0,24850,0,N,N,1,90000
11416,DOMINO'S PIZZA,PONTE VERDE BEACH,FL,32082,OLD FLORIDA BANK,FL,0,1-Apr-97,1997,85,50,1.0,0,0,24850,0,0,N,1,90000
12244,DOMINO'S PIZZA,GUNTERSVILLE,AL,35976,BANCORPSOUTH BANK,AL,0,2-Apr-97,1997,60,12,1.0,0,0,24850,0,N,Y,1,75000
14694,DOMINO'S PIZZA,UNIONDALE,NY,11553,BANCO POPULAR NORTH AMERICA,NY,0,8-Apr-97,1997,120,12,2.0,0,0,24850,0,0,N,1,96000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
890531,DOMINO'S PIZZA,LENEXA,KS,66215,U.S. BANK NATIONAL ASSOCIATION,KS,0,20-Dec-96,1997,60,10,1.0,0,0,24850,0,N,Y,1,35000
891237,DOMINO'S PIZZA,GREENSBORO,NC,27406,BANK OF AMERICA NATL ASSOC,MD,0,30-Dec-96,1997,60,48,1.0,0,0,24850,0,0,N,1,200000
892910,DOMINO'S PIZZA,WATERLOO,IL,62298,MONTGOMERY BANK NATL ASSOC,MO,0,16-Jan-97,1997,84,13,2.0,0,0,24850,0,N,Y,1,50000
893716,DOMINO'S PIZZA,MT. AIRY,MD,21771,BRANCH BK. & TR CO,MD,0,24-Jan-97,1997,120,122,1.0,0,0,1,0,N,N,1,251000


FranchiseCode
24850    279
1         44
0          3
25000      1
24902      1
Name: count, dtype: int64

In [10]:
# Rows whose NAICS column equals 0.
# NOTE(review): 0 looks like a placeholder for a missing industry code — confirm.
has_zero_naics = data["NAICS"].eq(0)
naics_subset = data.loc[has_zero_naics]
display(naics_subset)

Unnamed: 0,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,MIS_Status,GrAppv
3,"BIG BUCKS PAWN & JEWELRY, LLC",BROKEN ARROW,OK,74012,1ST NATL BK & TR CO OF BROKEN,OK,0,28-Feb-97,1997,60,2,1.0,0,0,1,0,N,Y,1,35000
4,"ANASTASIA CONFECTIONS, INC.",ORLANDO,FL,32801,FLORIDA BUS. DEVEL CORP,FL,0,28-Feb-97,1997,240,14,1.0,7,7,1,0,N,N,1,229000
6,MIDDLE ATLANTIC SPORTS CO INC,UNION,NJ,7083,WELLS FARGO BANK NATL ASSOC,SD,0,2-Jun-80,1980,45,45,2.0,0,0,0,0,N,N,0,600000
9,INTEXT BUILDING SYS LLC,GLASTONBURY,CT,6073,WEBSTER BANK NATL ASSOC,CT,0,28-Feb-97,1997,84,3,2.0,0,0,1,0,N,Y,1,70000
13,"ORCHARD CAFE & BAKERY, INC.",SLATERSVILLE,RI,2876,CITIZENS BANK NATL ASSOC,RI,0,28-Feb-97,1997,120,2,2.0,0,0,1,0,N,N,1,370000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
897156,NORTH SHORE FLORAL,WOODBURY,NY,11797,FLUSHING BANK,NY,0,27-Feb-97,1997,119,5,1.0,0,0,1,0,0,N,0,142000
897157,"LITWIN LIVERY SERVICES, INC.",CAMPBELL,OH,44405,JPMORGAN CHASE BANK NATL ASSOC,IL,0,27-Feb-97,1997,60,1,1.0,0,0,1,0,0,N,1,10000
897161,SHADES WINDOW TINTING AUTO ALA,IRVING,TX,75062,LOANS FROM OLD CLOSED LENDERS,DC,0,27-Feb-97,1997,84,5,2.0,0,0,1,0,N,Y,1,79000
897165,"MARUTAMA HAWAII, INC.",HONOLULU,HI,96830,BANK OF HAWAII,HI,0,27-Feb-97,1997,60,6,1.0,0,0,1,0,N,Y,0,75000
