#### CLEAN DATASETS FROM CARTO-SONUM

09/03/2019 - Julien Paris

for each dataset : 
- read CSV/EXCEL
- get unique values from all columns
- retrieve unique values on specific columns
- export original data as CSV and XLS files
- export unique values as a JSON file

cf : https://github.com/co-demos/carto-sonum/tree/master/data

In [9]:
# -*- encoding: utf-8 -*-
import json
import pandas as pd
import numpy as np

In [10]:
class CSVtoXLS :
    
    def __init__(self, name, code, ext="csv",sep=";", encoding="utf-8") : 
        self.filename = name
        self.code = code
        self.ext = ext
        self.file = name + "." + ext
        self.sep = sep
        self.encoding = encoding
        self.df = None
        self.columnsNames = []
        self.columnsUniques = []

    def toDF(self):
        if self.ext == "csv":
            df = pd.read_csv(self.file, sep=self.sep, encoding=self.encoding)
        else : 
            df = pd.read_excel(self.file, encoding=self.encoding)
        df = df.dropna(how="all")
        df = df.replace({np.nan:None}) 
        self.df = df
        return df
    
    def getUniques(self):
        results=[]
        for col in self.df:
            self.columnsNames.append(col)
            uniques = self.df[col].unique()
            u_dict = {"colname" : col, "uniques" : uniques.tolist() }
            u_dict["u_count"] = len(u_dict["uniques"])
            results.append(u_dict)
        self.columnsUniques = results
        return results
            
    def toXLS(self):
        #writer = pd.ExcelWriter(self.code + '.xlsx')
        #self.df.to_excel(writer, index = False)
        #writer.save()
        filename = "-" + self.code + '.xlsx'
        self.df.to_excel(filename, encoding="utf-8")
    
    def toTSV(self):
        filename = "-" + self.code + '.csv'
        self.df.to_csv(filename, sep='\t', encoding='utf-8')


In [20]:
CSVfiles = [ 

    ### DRJSCS
    {"name" : "export_base_ressources_territoriales_20181227_DRJSCS_HDF", 
     "ext" : "csv",
     "encoding" : "ISO-8859-1",
     "sep" : ";",
     "colToGetUniques" : [
         u"Enjeu", 
         u"Dispositif", 
         u"Public_cible", 
         u"Statut",
         u"Echelle",
         u"Conditions_acces",
         u"Horaires"
     ],
     "code" : "DRJSCS"
    },
    
    ### MSAP
    {"name" : "MSAP-20180627", 
     "ext" : "csv",
     "encoding" : "ISO-8859-1",
     "sep" : ";",
     "colToGetUniques" : [
         u"Horaires d'ouverture",
         u"Accessibilité",
         u"Equipements à disposition"
     ],
     "code" : "MSAP"
    },
    
    ### LOIRE ATLANTIQUE
    {"name" : "224400028_lieux-numeriques-en-loire-atlantique", 
     "ext" : "csv",
     "encoding" : "utf-8",
     "sep" : ";",
     "colToGetUniques" : [
         u"Type de porteur", 
         u"Public",
         u"Tarif(s)"
     ],
     "code" : "Loire-Atlantique"
    },
    
    ### GIRONDE
    {"name" : "listeEPN-NETPUBLIC-Gironde", 
     "ext" : "csv",
     "encoding" : "utf-8",
     "sep" : ",",
     "colToGetUniques" : [
         u"Label",
         u"Services proposés",
         u"Tarif"
     ],
     "code" : "Gironde"
    },
    
    ### NET PUBLIC 
    {"name" : "Copie de Annuaire EPN Netpublic - decembre 2016", 
     "ext" : "csv",
     "encoding" : "ISO-8859-1",
     "sep" : ";",
     "colToGetUniques" : [
         u"Statut",
     ],
     "code" : "NetPublic"
    },
]

In [21]:
CSVlist = [CSVfiles[4]]
#CSVlist = CSVfiles

In [22]:

CSV_dfList = []
for csv in CSVlist:
    classCSV = CSVtoXLS(csv["name"],csv["code"],ext=csv["ext"],sep=csv["sep"],encoding=csv["encoding"] )
    classCSV.toDF()
    classCSV.getUniques()
    result = { 
        "code" : csv["code"], 
        "df" : classCSV.df, 
        "columnsNames" : classCSV.columnsNames,
        "columnsUniques" : classCSV.columnsUniques,
        "classCSV" : classCSV, 
        "count" : {"rows" : classCSV.df.shape[0],"columns" : classCSV.df.shape[1] },
        "csvInfos" : csv,
        "colUniquesToKeep" : [ c for c in classCSV.columnsUniques if c["colname"] in csv["colToGetUniques"]]
    }
    CSV_dfList.append(result)

  if self.run_code(code, result):


In [23]:
for csv in CSV_dfList:
    print "- code : %s" %(csv["code"])
    print "- count : %s" %(csv["count"])
    print "- columnsNames : " 
    #print csv["columnsNames"]
    for c in csv["columnsNames"] : 
        print c
    print
    col = csv["columnsUniques"]
    for c in col : 
        if c in csv["colUniquesToKeep"]: 
            print "- colname : %s" %(c["colname"])
            print "- uniques[0:3] : %s..." %(c["uniques"][0:3])
            print "- u_count : %s" %(c["u_count"])
            print "- "*5
    print "=== "*10

- code : NetPublic
- count : {'rows': 4847, 'columns': 132}
- columnsNames : 
Identifiant
Date de dernire modification
Nom
Statut
Labels territoriaux
Label NetPublic
Mise en oeuvre du dispositif "2000 Emplois d'Avenir en EPN"
Demande de labellisation NetPublic
Site Web
N¡ et libell de voie
N¡ et libell de voie.1
Code Insee commune
Code postal
Localit
Tlphone
Email
Horaires d'ouverture
Latitude
Longitude
Prcision
Personne  contacter
Services proposs
quipement
Tarif
Nombre d'animateurs
Entre - Accs [Accessibilit]
Entre - En cas de prsence d'une marche, prciser sa hauteur en cm [Accessibilit]
Entre - Ouverture de porte [Accessibilit]
Entre - Largeur de porte en cm [Accessibilit]
Entre - Obstacle  la porte d'entre [Accessibilit]
Entre - Accs contrl [Accessibilit]
Circulation - Circulation dans les alles [Accessibilit]
Circulation - Accs en fauteuil dans tout le lieu [Accessibilit]
Texte court
Extrieur - Places GIC/GIR  proximit [Accessibilit]
Extrieur

In [25]:
CSV_dfList[0]["df"].head(3)

Unnamed: 0,Identifiant,Date de dernire modification,Nom,Statut,Labels territoriaux,Label NetPublic,"Mise en oeuvre du dispositif ""2000 Emplois d'Avenir en EPN""",Demande de labellisation NetPublic,Site Web,N¡ et libell de voie,...,Accueil de seniors,Organisation d'vnements,Image,Centre d'Animation des Portes Ferres,Tlphone direct,REPAIR CAFE,EPN,ORDI2.0,Coworking,Mission
0,54ec6c76de77fe0314ee3a1d,24/02/2015,Association Lalouma,Structure associative,PassNumrique (Rhne Alpes),1,0.0,0,http://www.lalouma.org,78 monte de la Grande Cte,...,,,,,,,,,,
1,4f340209de77fe55ff000e7b,14/02/2012,Espace Cyber-base EPM de Lavaur,,,1,,0,http://www.cyber-base.org/cyberbase/afficheInf...,275 rue de Cocagne,...,,,,,,,,,,
2,4f340209de77fe55ff000e7a,14/02/2012,APER-CTL,,Cyber-centre (Nord-Pas-de-Calais),1,,0,,Ruelle de Prseau,...,,,,,,,,,,


In [26]:
for csvClass in CSV_dfList:
    csvClass["classCSV"].toXLS()
    csvClass["classCSV"].toTSV()
    
    ### dump json only for uniques
    with open( "-" + csvClass["code"] + '.json', 'w') as fp:
        dict_to_dump = {
            k:v for k,v in csvClass.iteritems() if k in [
                "code", "columnsNames", "count", "csvInfos", "colUniquesToKeep"
            ]
        }
        json.dump(dict_to_dump, fp)

Accs%20%20l'espace%20multimdia%20libre%20sur%20rservation%20sur%20place%20ou%20par%20tlphone

Mardi%20:%2016h00%20-%2019h00
Mercredi%20:%2010h00%20-%2012h30%20%20//%2014h00%20-%2018h00
Jeudi%20:%20Fermeture
Vendredi%20:%20Fermeture
Samedi%20:%2010h00%20-%2012h30%20//%2014h00%20-%2018h00' with link or location/anchor > 255 characters since it exceeds Excel's limit for URLS
  force_unicode(url))

Les%20animateurs%20du%20Mobile%20se%20dplacent%20dans%20un%20vhicule%20facilement%20identifiable%20!%20Mais%20attention,%20contrairement%20%20certaines%20ides%20reues,%20ce%20ne%20sont%20pas%20les%20participants%20qui%20montent%20dans%20le%20vhicule%20mais%20bien%20le%20matriel%20qui%20en%20sort%20pour%20tre%20install%20dans%20une%20salle%20mise%20%20disposition%20par%20le%20lieux%20d'accueil%20(communes,%20communaut%20de%20communes,%20associations,%20cole...)' with link or location/anchor > 255 characters since it exceeds Excel's limit for URLS
  force_unicode(url))
