# Analyse collected data

In [21]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before each cell executes.
%load_ext autoreload
%autoreload 2

## Import packages

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Load data

In [2]:
# The path is relative, so it is resolved against the kernel's working directory.
csv_path = "../data/aides_v2.csv"
data = pd.read_csv(csv_path)

## Overview

In [4]:
# Column names, non-null counts and dtypes of the loaded table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2637 entries, 0 to 2636
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   name               2637 non-null   object
 1   url                2637 non-null   object
 2   error_orig         2637 non-null   bool  
 3   type_error_orig    847 non-null    object
 4   error_app          2637 non-null   bool  
 5   type_error_app     2211 non-null   object
 6   pdfs               2637 non-null   object
 7   pdf_avec_criteres  2637 non-null   bool  
dtypes: bool(3), object(5)
memory usage: 110.9+ KB


In [3]:
# Preview the first rows of the table.
data.head()

Unnamed: 0,name,url,error_orig,type_error_orig,error_app,type_error_app,pdfs,pdf_avec_criteres
0,Créer un agrégateur territorial à La Rochelle ...,/aides/7ee8-creer-un-agregateur-territorial-a-...,False,url not starting with http,False,url not starting with http,[],False
1,Soutenir et expérimenter des initiatives de tr...,/aides/ec55-soutenir-et-experimenter-des-initi...,True,,True,,[],False
2,Accompagner les projets de développement local...,/aides/bbee-accompagner-les-projets-de-develop...,True,,True,,[],False
3,Réduire les émissions de polluants atmosphériq...,/aides/5949-reduire-les-emissions-de-polluants...,True,,False,url not starting with http,[],False
4,Changer le comportement des usagers grâce à la...,/aides/cd9e-changement-comportemental-des-usag...,False,url not starting with http,False,url not starting with http,[],False


In [None]:
# In the head() preview, rows that carry an error type have their flag set to False,
# i.e. the raw boolean columns are inverted. Flip them so that 1 means "error found".
# The dtype guard keeps this cell idempotent: after the first run the columns hold
# integers, so re-running the cell leaves them alone instead of flipping them back.
for flag_column in ("error_orig", "error_app"):
    if data[flag_column].dtype == bool:
        data[flag_column] = 1 - data[flag_column]

In [18]:
# Count distinct URLs; missing values are not counted.
n_aides = len(data["url"].dropna().unique())
print(f"There are {n_aides} unique pages that were analysed")

There are 2637 unique pages that were analysed


In [20]:
# The flags are summed as numbers: a row-wise total of 2 means both URLs are flagged.
n_errors_orig = data["error_orig"].sum()
n_errors_app = data["error_app"].sum()
n_errors = data["error_orig"].add(data["error_app"]).eq(2).sum()

print(
    f"{n_errors_orig} errors found on orig url",
    f"{n_errors_app} errors found on app url",
    f"{n_errors} errors found on both url",
    sep="\n",
)

847 errors found on orig url
2211 errors found on app url
782 errors found on both url


In [33]:
# Relabel cells that exactly equal the old message; every other value is left unchanged.
for error_type_column in ("type_error_orig", "type_error_app"):
    data[error_type_column] = data[error_type_column].replace(
        "url not starting with http", "url empty or not valid"
    )

In [34]:
# Keep only the text before " to http" so messages sharing a prefix are counted together.
orig_error_parts = data["type_error_orig"].str.split(" to http", expand=True)
orig_error_parts.iloc[:, 0].value_counts()

url empty or not valid    288
url changed               246
Unknown error             144
Status 404                123
Status 403                 28
Status 421                 12
Status 503                  3
Status 400                  2
Status 520                  1
Name: 0, dtype: int64

In [35]:
# Keep only the text before " to http" so messages sharing a prefix are counted together.
app_error_parts = data["type_error_app"].str.split(" to http", expand=True)
app_error_parts.iloc[:, 0].value_counts()

url empty or not valid    1895
url changed                178
Unknown error               82
Status 404                  24
Status 403                  18
Status 421                  11
Status 400                   2
Status 503                   1
Name: 0, dtype: int64

In [45]:
# Keywords quoted in the message below, and the number of rows flagged pdf_avec_criteres.
criterias = ["conditions", "critères", "éligible", "éligibilité"]
n_aide_pdf_criterias = data["pdf_avec_criteres"].sum()
print(
    f"There are {n_aide_pdf_criterias} pages with a pdf containing one of the following words : {criterias}"
)

There are 576 pages with a pdf containing one of the following words : ['conditions', 'critères', 'éligible', 'éligibilité']


In [49]:
# Quick check that literal_eval turns the text of a list into an actual list.
# NOTE: later cells call literal_eval, so this import has to run before them.
from ast import literal_eval
t = "[1, 2]"
literal_eval(t)

[1, 2]

In [55]:
# Distribution of the number of PDFs per row, restricted to rows flagged pdf_avec_criteres.
# Each "pdfs" value is the text of a list, parsed here with literal_eval.
pdf_lists = data.loc[data["pdf_avec_criteres"], "pdfs"].apply(literal_eval)
pdf_lists.apply(len).value_counts()

1     302
2     114
8      91
3      17
6      15
4      12
10     10
18      6
5       3
7       3
9       2
11      1
Name: pdfs, dtype: int64

In [69]:
# Number of distinct "pdfs" values that appear on exactly one flagged row.
flagged_pdfs = data.loc[data["pdf_avec_criteres"], "pdfs"]
flagged_pdfs.value_counts().eq(1).sum()

341

In [61]:
# Print each flagged aide's name followed by its PDF URLs, one per line.
# "pdfs" holds the text of a Python list, so every value is parsed with literal_eval.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0;
# items() also exists in earlier pandas versions, so older environments keep working.
pdf_lists_by_aide = (
    data.loc[data["pdf_avec_criteres"], ["name", "pdfs"]]
    .set_index("name")["pdfs"]
    .apply(literal_eval)
)
for aide, pdfs in pdf_lists_by_aide.items():
    print(aide)
    print("\n".join(pdfs))
    print()

Aide à la création et au développement d'entreprises
http://www.cc-madetmoselle.fr/images/FR-MODIF.pdf
http://www.cc-madetmoselle.fr/images/Documents/REGLEMENT-PLAN.pdf
http://www.cc-madetmoselle.fr/images/Documents/REG-2020-2023.pdf

Aide au développement numérique
http://www.cc-madetmoselle.fr/images/FR-MODIF.pdf
http://www.cc-madetmoselle.fr/images/Documents/REGLEMENT-PLAN.pdf
http://www.cc-madetmoselle.fr/images/Documents/REG-2020-2023.pdf

Financer l'investissement productif
http://ccv2m.fr/wp-content/uploads/2015/05/AET_reglement.pdf
http://ccv2m.fr/wp-content/uploads/2015/05/diagnostic_agricole_foncier.pdf
http://ccv2m.fr/wp-content/uploads/2019/12/tableau-V2M.pdf
http://ccv2m.fr/wp-content/uploads/2021/05/Reglement-interieur-CCV2M-1.pdf

Soutenir l'investissement en immobilier des entreprises dans le cadre d'une création ou extension d'activité
http://www.cc-madetmoselle.fr/images/FR-MODIF.pdf
http://www.cc-madetmoselle.fr/images/Documents/REGLEMENT-PLAN.pdf
http://www.cc-madet