# Exploration des données

## Importation des librairies et du dataset

In [1]:
import numpy as np
import pandas as pd
import pickle
import sweetviz as sv

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pickled dataset into `data`.
# The `with` statement closes the file when the block exits, so no explicit
# close() call is needed afterwards.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open("datasets/dataset.pkl", "rb") as file:
    data = pickle.load(file)

# Preview the first rows of the loaded dataset.
data.head()

Unnamed: 0,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,Term,...,UrbanRural,RevLineCr,LowDoc,MIS_Status,GrAppv,SBA_Appv,Zip_2,NAICS_2,Franchised,RealEstate
0,ABC HOBBYCRAFT,EVANSVILLE,IN,47711,FIFTH THIRD BANK,OH,451120,1997-02-28,1997,84,...,0,N,Y,1,60000,48000,47,45,0,0
1,LANDMARK BAR & GRILLE (THE),NEW PARIS,IN,46526,1ST SOURCE BANK,IN,722410,1997-02-28,1997,60,...,0,N,Y,1,40000,32000,46,72,0,0
2,"WHITLOCK DDS, TODD M.",BLOOMINGTON,IN,47401,GRANT COUNTY STATE BANK,IN,621210,1997-02-28,1997,180,...,0,N,N,1,287000,215250,47,62,0,0
3,"BIG BUCKS PAWN & JEWELRY, LLC",BROKEN ARROW,OK,74012,1ST NATL BK & TR CO OF BROKEN,OK,0,1997-02-28,1997,60,...,0,N,Y,1,35000,28000,74,0,0,0
4,"ANASTASIA CONFECTIONS, INC.",ORLANDO,FL,32801,FLORIDA BUS. DEVEL CORP,FL,0,1997-02-28,1997,240,...,0,N,N,1,229000,229000,32,0,0,1


In [3]:
# Print a structural summary of the dataset: index range, column names,
# non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 897167 entries, 0 to 899163
Data columns (total 25 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   Name           897153 non-null  object        
 1   City           897137 non-null  object        
 2   State          897154 non-null  category      
 3   Zip            897167 non-null  object        
 4   Bank           895661 non-null  object        
 5   BankState      895654 non-null  category      
 6   NAICS          897167 non-null  object        
 7   ApprovalDate   897167 non-null  datetime64[ns]
 8   ApprovalFY     897167 non-null  category      
 9   Term           897167 non-null  int64         
 10  NoEmp          897167 non-null  int64         
 11  NewExist       897033 non-null  category      
 12  CreateJob      897167 non-null  int64         
 13  RetainedJob    897167 non-null  int64         
 14  FranchiseCode  897167 non-null  object        
 15  Urban

## Exploration avec SweetViz

In [4]:
# Optional preprocessing step, currently disabled: convert ApprovalFY to a
# numeric dtype before running the analysis.
# data["ApprovalFY"] = pd.to_numeric(data['ApprovalFY'], errors='coerce')

# Column passed to Sweetviz as the target feature, and where the HTML report is written.
SV_TARGET = "MIS_Status"
SV_REPORT_PATH = "sv-reports/report.html"

# Build the Sweetviz report for the whole dataset, then save it as HTML
# using the vertical layout.
report = sv.analyze(data, SV_TARGET)
report.show_html(SV_REPORT_PATH, layout="vertical")

Done! Use 'show' commands to display/save.   |██████████| [100%]   00:01 -> (00:00 left)


Report sv-reports/report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


## Autres explorations

In [5]:
# Keep only the rows whose "Name" is exactly "SUBWAY".
subway_subset = data.loc[data["Name"].eq("SUBWAY")]

# Show the first matching rows.
display(subway_subset.head())

# Count the matching rows per MIS_Status value.
subway_subset["MIS_Status"].value_counts()


Unnamed: 0,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,Term,...,UrbanRural,RevLineCr,LowDoc,MIS_Status,GrAppv,SBA_Appv,Zip_2,NAICS_2,Franchised,RealEstate
24,SUBWAY,LITTLE ROCK,AR,72223,HOPE FCU,MS,722211,2006-02-07,2006,126,...,1,N,N,1,137300,116705,72,72,0,0
3156,SUBWAY,GLEN MILLS,PA,19342,"PNC BANK, NATIONAL ASSOCIATION",DE,722211,2006-02-10,2006,72,...,1,0,N,1,50000,25000,19,72,0,0
7461,SUBWAY,MANCHESTER,ME,4351,BANGOR SAVINGS BANK,ME,722211,2006-02-16,2006,84,...,1,0,N,1,102000,86700,43,72,1,0
8287,SUBWAY,NORWICH,CT,6360,BANK OF AMERICA NATL ASSOC,RI,722211,2004-09-08,2004,36,...,1,Y,N,1,47000,23500,63,72,1,0
10198,SUBWAY,ATHENS,TX,75751,COMMUNITY NATL BK & TR OF TEXA,TX,0,1997-04-01,1997,84,...,0,N,Y,1,77600,62080,75,0,0,0


MIS_Status
1    1203
0      66
Name: count, dtype: int64

In [6]:
# Keep only the rows whose "Name" is exactly "DOMINO'S PIZZA".
pizza_subset = data.loc[data["Name"].eq("DOMINO'S PIZZA")]

# Show the first matching rows.
display(pizza_subset.head())

Unnamed: 0,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,Term,...,UrbanRural,RevLineCr,LowDoc,MIS_Status,GrAppv,SBA_Appv,Zip_2,NAICS_2,Franchised,RealEstate
2499,DOMINO'S PIZZA,SARASOTA,FL,34231,STEARNS BK NATL ASSOC,FL,0,1997-03-06,1997,60,...,0,N,N,1,260000,195000,34,0,1,0
10553,DOMINO'S PIZZA,SPRINGFIELD,MO,65802,EMPIRE BANK,MO,0,1997-03-28,1997,84,...,0,N,N,1,90000,72000,65,0,1,0
11440,DOMINO'S PIZZA,PONTE VERDE BEACH,FL,32082,OLD FLORIDA BANK,FL,0,1997-04-01,1997,85,...,0,0,N,1,90000,72000,32,0,1,0
12269,DOMINO'S PIZZA,GUNTERSVILLE,AL,35976,BANCORPSOUTH BANK,AL,0,1997-04-02,1997,60,...,0,N,Y,1,75000,60000,35,0,1,0
14721,DOMINO'S PIZZA,UNIONDALE,NY,11553,BANCO POPULAR NORTH AMERICA,NY,0,1997-04-08,1997,120,...,0,0,N,1,96000,76800,11,0,1,0


In [7]:
# Keep only the rows whose "NAICS" value equals 0.
naics_subset = data.loc[data["NAICS"].eq(0)]

# Display the subset (pandas truncates the rendering of long frames).
display(naics_subset)

Unnamed: 0,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,Term,...,UrbanRural,RevLineCr,LowDoc,MIS_Status,GrAppv,SBA_Appv,Zip_2,NAICS_2,Franchised,RealEstate
3,"BIG BUCKS PAWN & JEWELRY, LLC",BROKEN ARROW,OK,74012,1ST NATL BK & TR CO OF BROKEN,OK,0,1997-02-28,1997,60,...,0,N,Y,1,35000,28000,74,0,0,0
4,"ANASTASIA CONFECTIONS, INC.",ORLANDO,FL,32801,FLORIDA BUS. DEVEL CORP,FL,0,1997-02-28,1997,240,...,0,N,N,1,229000,229000,32,0,0,1
6,MIDDLE ATLANTIC SPORTS CO INC,UNION,NJ,7083,WELLS FARGO BANK NATL ASSOC,SD,0,1980-06-02,1980,45,...,0,N,N,0,600000,499998,70,0,0,0
9,INTEXT BUILDING SYS LLC,GLASTONBURY,CT,6073,WEBSTER BANK NATL ASSOC,CT,0,1997-02-28,1997,84,...,0,N,Y,1,70000,56000,60,0,0,0
13,"ORCHARD CAFE & BAKERY, INC.",SLATERSVILLE,RI,2876,CITIZENS BANK NATL ASSOC,RI,0,1997-02-28,1997,120,...,0,N,N,1,370000,277500,28,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899153,NORTH SHORE FLORAL,WOODBURY,NY,11797,FLUSHING BANK,NY,0,1997-02-27,1997,119,...,0,0,N,0,142000,106500,11,0,0,0
899154,"LITWIN LIVERY SERVICES, INC.",CAMPBELL,OH,44405,JPMORGAN CHASE BANK NATL ASSOC,IL,0,1997-02-27,1997,60,...,0,0,N,1,10000,5000,44,0,0,0
899158,SHADES WINDOW TINTING AUTO ALA,IRVING,TX,75062,LOANS FROM OLD CLOSED LENDERS,DC,0,1997-02-27,1997,84,...,0,N,Y,1,79000,63200,75,0,0,0
899162,"MARUTAMA HAWAII, INC.",HONOLULU,HI,96830,BANK OF HAWAII,HI,0,1997-02-27,1997,60,...,0,N,Y,0,75000,60000,96,0,0,0
