In [2]:
import datetime as dt 
import pandas as pd
import numpy as np
import requests
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import urllib.parse

In [3]:
link = 'https://opendata.paris.fr/explore/dataset/lieux-de-tournage-a-paris/download/?format=csv&timezone=Europe/Berlin&lang=fr&use_labels_for_header=true&csv_separator=%3B'

df_tournage = pd.read_csv(link, delimiter=';', on_bad_lines='skip')

In [4]:
# Drop unused :
df_tournage = df_tournage.drop(columns=["Identifiant du lieu","Localisation de la scène","Année du tournage","Coordonnée en X", "Coordonnée en Y","geo_shape"])

# Corrections Type :
df_tournage["Code postal"] = pd.to_numeric(df_tournage["Code postal"])
df_tournage = df_tournage.dropna(subset = ['Code postal'])
df_tournage["Code postal"] = df_tournage["Code postal"].astype(int)
df_tournage["Date de début"] = pd.to_datetime(df_tournage["Date de début"])
df_tournage["Date de fin"] = pd.to_datetime(df_tournage["Date de fin"])
df_tournage["latitude"] = pd.to_numeric( [ x.split(",")[0] for x in df_tournage["geo_point_2d"][:][:] ] )
df_tournage["longitude"] = pd.to_numeric( [ x.split(",")[1] for x in df_tournage["geo_point_2d"][:][:] ] )
df_tournage = df_tournage.drop(columns="geo_point_2d")

# Clear Name Columns :
df_tournage = df_tournage.rename(columns = {"Type de tournage":"type", "Code postal":"postal", "Date de début":"debut", "Date de fin":"fin"})
df_tournage.columns = [ x.lower() for x in df_tournage.columns.tolist() ] # ABC -> abc

# keep only films :
df_tournage = df_tournage[df_tournage["type"] == 'Long métrage']
df_tournage = df_tournage.drop(columns= ['type'])

df_tournage = df_tournage.reset_index().drop(columns=['index'])

df_tournage.head()

Unnamed: 0,titre,réalisateur,producteur,postal,debut,fin,latitude,longitude
0,TOUT S'EST BIEN PASSE,Francois OZON,MANDARIN PRODUCTION,75013,2020-08-20,2020-08-21,48.83566,2.348315
1,Une jeune fille qui va bien,Sandrine Kiberlain,CURIOSA FILMS,75004,2020-08-31,2020-09-01,48.854533,2.361694
2,French Exit,Azazel Jacobs,Same Player,75012,2019-12-04,2019-12-04,48.850067,2.376519
3,FIN DE MATINEE,Hiroshi NISHATANI,COMME DES CINEMAS,75004,2018-11-05,2018-11-05,48.854112,2.354679
4,HORS NORMES,Eric Toledano et Olivier Nakache,ADNP QUAD FILMS,75001,2018-11-05,2018-11-06,48.865744,2.327446


In [5]:
url_base = 'https://www.imdb.com/search/title/?title='
param2 = '&release_date=2015-01-01,2022-12-31'

df_tournage['imdb_search'] = ''
# https://www.imdb.com/search/title/?title=TOUT+S%27EST+BIEN+PASSE&release_date=2015-01-01,2022-12-31

for i in range( len(df_tournage) ):

    title = urllib.parse.quote(df_tournage.iloc[i]['titre']).replace('%20', '+')

    df_tournage.imdb_search[i] = url_base + title + param2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tournage.imdb_search[i] = url_base + title + param2


In [6]:
df_tournage.head()

Unnamed: 0,titre,réalisateur,producteur,postal,debut,fin,latitude,longitude,imdb_search
0,TOUT S'EST BIEN PASSE,Francois OZON,MANDARIN PRODUCTION,75013,2020-08-20,2020-08-21,48.83566,2.348315,https://www.imdb.com/search/title/?title=TOUT+...
1,Une jeune fille qui va bien,Sandrine Kiberlain,CURIOSA FILMS,75004,2020-08-31,2020-09-01,48.854533,2.361694,https://www.imdb.com/search/title/?title=Une+j...
2,French Exit,Azazel Jacobs,Same Player,75012,2019-12-04,2019-12-04,48.850067,2.376519,https://www.imdb.com/search/title/?title=Frenc...
3,FIN DE MATINEE,Hiroshi NISHATANI,COMME DES CINEMAS,75004,2018-11-05,2018-11-05,48.854112,2.354679,https://www.imdb.com/search/title/?title=FIN+D...
4,HORS NORMES,Eric Toledano et Olivier Nakache,ADNP QUAD FILMS,75001,2018-11-05,2018-11-06,48.865744,2.327446,https://www.imdb.com/search/title/?title=HORS+...


In [7]:
df_tournage.to_csv(r'src/df_tournage_with_URL.csv', index = False, header=True)