In [1]:
from datetime import datetime
import pandas as pd
import json
import requests
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = [15, 12]

In [2]:
import networkx as nx

from networkx.readwrite import json_graph
from networkx.algorithms import community

In [3]:
df = pd.read_csv('2008.csv')[['Month', 'DayofMonth', 
                              'Year', 'ArrTime', 'DepTime',
                              'Origin', 'Dest', 
                              'Cancelled', 'FlightNum']]

In [4]:
df['Date'] = df['Year'].astype(str) +"-"+ df['Month'].astype(str) +"-"+ df['DayofMonth'].astype(str)

In [5]:
df.head()

Unnamed: 0,Month,DayofMonth,Year,ArrTime,DepTime,Origin,Dest,Cancelled,FlightNum,Date
0,1,3,2008,2211.0,2003.0,IAD,TPA,0,335,2008-1-3
1,1,3,2008,1002.0,754.0,IAD,TPA,0,3231,2008-1-3
2,1,3,2008,804.0,628.0,IND,BWI,0,448,2008-1-3
3,1,3,2008,1054.0,926.0,IND,BWI,0,1746,2008-1-3
4,1,3,2008,1959.0,1829.0,IND,BWI,0,3920,2008-1-3


In [6]:
df = df.loc[df['Cancelled'] != 1]
df = df.drop(['Month', 'DayofMonth', 'Year', 'Cancelled'], axis=1)

In [7]:
df = df.dropna()

In [8]:
def col_to_datetime(df, col):
    df[col] = df[col].astype(int)
    df[col] = df[col].astype(str)
    df[col] = ('0'*3) + df[col]
    df[col] = df[col].str.replace('.*(?=....$)', '', regex=True) # оставим только последние 4 символа 
    df[col] = df[col].str.replace('(?<=^..)(?!.$)', ':', regex=True)
    df[col] = df[col].str.replace('^24', '00', regex=True)
    df[col] = df[col] + ":00"
    df[col] = df['Date'] + " " + df[col]
    df[col] = df[col].astype('datetime64[ns]')

In [9]:
col_to_datetime(df, 'ArrTime')
col_to_datetime(df, 'DepTime')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6858079 entries, 0 to 7009727
Data columns (total 6 columns):
 #   Column     Dtype         
---  ------     -----         
 0   ArrTime    datetime64[ns]
 1   DepTime    datetime64[ns]
 2   Origin     object        
 3   Dest       object        
 4   FlightNum  int64         
 5   Date       object        
dtypes: datetime64[ns](2), int64(1), object(3)
memory usage: 366.3+ MB


In [11]:
df.head()

Unnamed: 0,ArrTime,DepTime,Origin,Dest,FlightNum,Date
0,2008-01-03 22:11:00,2008-01-03 20:03:00,IAD,TPA,335,2008-1-3
1,2008-01-03 10:02:00,2008-01-03 07:54:00,IAD,TPA,3231,2008-1-3
2,2008-01-03 08:04:00,2008-01-03 06:28:00,IND,BWI,448,2008-1-3
3,2008-01-03 10:54:00,2008-01-03 09:26:00,IND,BWI,1746,2008-1-3
4,2008-01-03 19:59:00,2008-01-03 18:29:00,IND,BWI,3920,2008-1-3


In [12]:
df['Flight_Index'] = df.index
arriving_flights = df[['ArrTime','Origin', 'Dest', 'Flight_Index']]
departing_flights = df[['DepTime', 'Origin', 'Dest', 'Flight_Index']]

arriving_flights['ActionTime'] = arriving_flights['ArrTime']
departing_flights['ActionTime'] = departing_flights['DepTime']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  arriving_flights['ActionTime'] = arriving_flights['ArrTime']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  departing_flights['ActionTime'] = departing_flights['DepTime']


In [13]:
# взлёт и позадка - это разные события, на этой оснвое и построим симуляцию
all_flights = pd.concat([arriving_flights, departing_flights], ignore_index=True)

In [14]:
all_flights = all_flights.sort_values(['ActionTime'])
all_flights.loc[all_flights['ArrTime'].isna(), 'ActionType'] = 'Dep'
all_flights.loc[all_flights['DepTime'].isna(), 'ActionType'] = 'Arr'
all_flights = all_flights.drop(['ArrTime', 'DepTime'], axis=1)
all_flights

Unnamed: 0,Origin,Dest,Flight_Index,ActionTime,ActionType
7272623,LAX,DTW,428607,2008-01-01 00:00:00,Dep
236012,SFO,SAN,243790,2008-01-01 00:00:00,Arr
287289,HOU,HRL,296192,2008-01-01 00:00:00,Arr
387490,MIA,CLT,399744,2008-01-01 00:00:00,Arr
249085,PHX,STL,257285,2008-01-01 00:00:00,Arr
...,...,...,...,...,...
6614819,ATL,PIT,6758018,2008-12-31 23:58:00,Arr
6644826,DEN,BOI,6788915,2008-12-31 23:58:00,Arr
6720544,MSP,SLC,6867744,2008-12-31 23:59:00,Arr
13380289,HNL,SFO,6663085,2008-12-31 23:59:00,Dep


In [41]:
import random
from tqdm import tqdm

# установим порог, после которого симуляция останавливается
def start_infection(airport, prob, df, threshold=95):
    flight_to_skip = []
    infected_flight = []
    
    airports = pd.Series(pd.concat([df['Dest'], df['Origin']]).unique())
    spread_infection = pd.DataFrame(airports, columns=['airport'])
    spread_infection['infection'] = 0
    spread_infection.loc[spread_infection['airport'] == airport, 'infection'] = 1
    
    for i in tqdm(df.itertuples(), total=len(df)):
        # сразу пропустить если рейс из здорового аэропорта
        if i[3] in flight_to_skip:
            flight_to_skip.remove(i[3])
            continue
            
        airport_of_origin = i[1]
        airport_of_dest = i[2]
        
        #Заражён ли аэропорт прибытия/отбытия
        infected_or_not_orig = bool(spread_infection.loc[spread_infection['airport'] == airport_of_origin, 'infection'].all())
        infected_or_not_dest = bool(spread_infection.loc[spread_infection['airport'] == airport_of_dest, 'infection'].all())
        
        # Если рейс прибывающий - ...
        if i[5] == 'Arr':
            # пропустить, если аэропорт прибытия уже заражён
            if infected_or_not_dest:
                continue
                
        # Если рейс отлетает 
        if i[5] == 'Dep':
            # и самолёт здоровый
            if not infected_or_not_orig:
                # - добавить в список на пропуск данный рейс
                flight_to_skip.append(i[3])
            else:
                # - добавить в список зараженный рейс
                infected_flight.append(i[3])
        
        # Если рейс прибывающий - ...
        if i[5] == 'Arr': 
            # и если рейс зараженный, попытаться заразить
            if i[3] in infected_flight:
                infected_flight.remove(i[3])
                if random.random() < prob:
                    spread_infection.loc[spread_infection['airport'] == airport_of_dest, 'infection'] = 1
                    spread_infection.loc[spread_infection['airport'] == airport_of_dest, 'infection_date'] = i[4]
        
        # Доля зараженых аэропортов
        prop_infected = (spread_infection['infection'].value_counts(normalize=True) * 100)[1]
        
        if (spread_infection['infection'] == 1).all():
            print('Evil is WON!!!')
            return spread_infection
        
        # Проведя итерации со стопроцентым шансом заражения, ВСЕ аэропорты "заражались" только под конец датасета,
        # поэтому остановим симуляцию когда заразится 95 процентов(этот параметр можно менять) от всех аэропортов.
        if np.round(prop_infected, 2) >= threshold: 
            print(f'{np.round(prop_infected)}% is infected!')
            return spread_infection
        

In [None]:
%%time
results = start_infection('FAR', 1, all_flights)
results

  6%|████                                                                | 826440/13716158 [09:53<2:31:23, 1419.00it/s]