In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

In [None]:
# Render every column of a DataFrame instead of eliding the middle ones.
pd.options.display.max_columns = None

In [None]:
# Location of the raw CSV export. The original absolute path stays as the default so the
# notebook behaves as before on the author's machine; set the DS_MULAKAT_CSV environment
# variable to point at the file on any other machine.
DATA_PATH = os.environ.get('DS_MULAKAT_CSV', r'C:\Users\manai\Desktop\NextRay\ds_mulakat.csv')
data = pd.read_csv(DATA_PATH)

In [None]:
# Keep only the columns the analysis relies on.
# 'direction' is part of the selection because the traffic-direction cells read df['direction'].
# .copy() makes df an independent frame, so the column assignments made in later cells
# do not operate on a slice of `data` (which triggers SettingWithCopyWarning).
ANALYSIS_COLUMNS = [
    'source_address',
    'destination_address',
    'event_time',
    'login_fail_count',
    'bidirectional_bytes',
    'src2dst_bytes',
    'dst2src_bytes',
    'sourceGeoPoint',
    'destinationGeoPoint',
    'direction',
]
df = data[ANALYSIS_COLUMNS].copy()

In [None]:
# Independent snapshot of the selected columns, taken before the cells below add or change columns on df.
df1 = df.copy(deep=True)

In [None]:
# (number of rows, number of columns) of the column-filtered frame
df.shape

In [None]:
# Per-column dtype and non-null summary for the columns at positions 0-99.
leading_columns = df.iloc[:, 0:100]
leading_columns.info()

In [None]:
# Same summary for any columns from position 100 onward.
# NOTE(review): df is built from a short explicit column list, so this slice contains no
# columns here — it looks like it was meant for the wider raw frame; confirm.
trailing_columns = df.iloc[:, 100:]
trailing_columns.info()

In [None]:
# Parse the raw event_time values into pandas datetimes so the .dt accessor can be used on the column.
parsed_event_time = pd.to_datetime(df['event_time'])
df['event_time'] = parsed_event_time

In [None]:
# Earliest and latest timestamp present in the data.
df['event_time'].aggregate(['min', 'max'])

In [None]:
# Minute-of-hour of each event; the cells below use it as a fine-grained time bucket for grouping.
# NOTE(review): events from different hours land in the same bucket if the data spans more than one hour — confirm.
df['minutes'] = df['event_time'].dt.minute

In [None]:
# Translate the traffic-direction labels into Turkish for the charts.
DIRECTION_LABELS_TR = {
    'INTERNAL2INTERNAL': 'İÇTEN İÇE',
    'INTERNAL2EXTERNAL': 'İÇTEN DIŞA',
    'EXTERNAL2INTERNAL': 'DIŞTAN İÇE',
}

# The labels are read from the raw frame `data` (df keeps its row index) rather than from df:
# `data` is never modified, so re-running this cell does not feed already-translated
# labels back through the mapping and turn them into NaN.
raw_direction = data['direction']

# A label without a translation keeps its original text instead of becoming NaN,
# so it still shows up in the value counts plotted below.
df['direction'] = raw_direction.map(DIRECTION_LABELS_TR).fillna(raw_direction)

In [None]:
# Bar chart: number of events per traffic direction.

# Count once and reuse for both axes so x and y are guaranteed to come from the same ordering.
direction_counts = df['direction'].value_counts()

plt.figure(figsize = (20,12))
ax = sns.barplot(x = direction_counts.index, y = direction_counts.values, ci = None)
# Print the count on top of each bar.
ax.bar_label(ax.containers[0], fontsize = 18)

plt.xlabel('Ağ Trafiğinin Yönü',fontsize = 32, fontweight = 'bold')
plt.xticks(fontsize =18,fontweight = 'bold')
# Label text corrected (was misspelled as 'Gerçkleşen' / 'Saysıs').
plt.ylabel('Ağda Gerçekleşen Aktivite Sayısı',fontsize = 24, fontweight = 'bold')
plt.yticks(fontsize = 12)
ax.set_yticks(range(0,150000,10000))
plt.title('Veri Ağındaki Trafik Yoğunluğunun Yönü',fontsize = 32, fontweight = 'bold')
# No legend call: the bars carry no labels, so plt.legend() only emitted a
# "no artists with labels" warning and drew nothing.
plt.show()

In [None]:
# Bar chart of how many events fall into each minute bucket.
minute_counts = df['minutes'].value_counts()

plt.figure(figsize=(12, 8))
ax = sns.barplot(
    x=minute_counts.index,
    y=minute_counts.values,
    color='#20c073',
)
# Annotate each bar with its event count.
ax.bar_label(ax.containers[0], fontsize=15)

plt.title('Dakika Başına Network Aktivitesi', fontsize=24, fontweight='bold')
plt.xlabel('Dakika', fontsize=24, fontweight='bold')
plt.ylabel('Aktivite Sayısı', fontsize=24, fontweight='bold')
plt.xticks(rotation=90, fontsize=18)
plt.yticks(fontsize=18)
plt.show()

In [None]:
# Source addresses ranked by the number of rows whose login_fail_count equals 1.
# NOTE(review): the equality test leaves out rows with a count above 1 — confirm that is intended.
df.loc[df['login_fail_count'].eq(1), 'source_address'].value_counts()

In [None]:
# Destination addresses ranked by the number of rows whose login_fail_count equals 1.
df.loc[df['login_fail_count'].eq(1), 'destination_address'].value_counts()

In [None]:
# (source, destination) address pairs ranked by the number of rows whose login_fail_count equals 1.
df.loc[
    df['login_fail_count'].eq(1),
    ['source_address', 'destination_address'],
].value_counts()

In [None]:
# Rows with login_fail_count == 1, counted per (source_address, minute) pair;
# ordered by count, then minute, both descending.
(
    df.loc[df['login_fail_count'].eq(1), ['source_address', 'minutes']]
    .value_counts()
    .reset_index(name='counts')
    .sort_values(by=['counts', 'minutes'], ascending=False)
)

In [None]:
# Rows with login_fail_count == 1, counted per (destination_address, minute) pair;
# ordered by count, then minute, both descending.
(
    df.loc[df['login_fail_count'].eq(1), ['destination_address', 'minutes']]
    .value_counts()
    .reset_index(name='counts')
    .sort_values(by=['counts', 'minutes'], ascending=False)
)


In [None]:
# Rows with login_fail_count == 1, counted per (source_address, destination_address, minute) triple;
# ordered by count, then minute, both descending.
(
    df.loc[
        df['login_fail_count'].eq(1),
        ['source_address', 'destination_address', 'minutes'],
    ]
    .value_counts()
    .reset_index(name='counts')
    .sort_values(by=['counts', 'minutes'], ascending=False)
)

In [None]:
# Number of events per (source_address, minute) pair, busiest pairs first.
events_per_source_minute = df.groupby(['source_address', 'minutes'])['minutes'].count()
groupby_sourcemin = events_per_source_minute.sort_values(ascending=False)

In [None]:
# Number of events per (destination_address, minute) pair, busiest pairs first.
events_per_destination_minute = df.groupby(['destination_address', 'minutes'])['minutes'].count()
groupby_destmin = events_per_destination_minute.sort_values(ascending=False)

In [None]:
# Number of events per (source_address, destination_address, minute) triple, busiest first.
events_per_pair_minute = df.groupby(
    ['source_address', 'destination_address', 'minutes']
)['minutes'].count()
groupby_sourcedest = events_per_pair_minute.sort_values(ascending=False)


In [None]:
# Turn each grouped Series into a flat DataFrame: the group keys become ordinary columns
# and the counts are stored in a column called 'values'.
df_reset_sourcemin, df_reset_destmin, df_reset_sourcedest = (
    grouped_counts.reset_index(name='values')
    for grouped_counts in (groupby_sourcemin, groupby_destmin, groupby_sourcedest)
)

In [None]:
# Interactive bubble chart of source addresses with more than 400 events in a minute bucket.
# Bubble size and y position both encode the event count; colour identifies the address.
busy_sources = df_reset_sourcemin[df_reset_sourcemin['values'] > 400]

fig = px.scatter(
    busy_sources,
    x='minutes',
    y='values',
    size='values',
    size_max=25,
    color='source_address',
    hover_name='source_address',
    labels={'minutes': 'Minutes', 'values': 'Values'},
    title='400 den Fazla Aktivitesi Olan Kaynak Adreslerin - Etkileşimli Balon Grafigi',
)
fig.show()

In [None]:
# Interactive bubble chart of destination addresses with more than 400 events in a minute bucket.
# Bubble size and y position both encode the event count; colour identifies the address.
busy_destinations = df_reset_destmin[df_reset_destmin['values'] > 400]

fig = px.scatter(
    busy_destinations,
    x='minutes',
    y='values',
    size='values',
    size_max=25,  # upper bound on the rendered bubble size
    color='destination_address',
    hover_name='destination_address',  # address shown as the tooltip heading
    labels={'minutes': 'Minutes', 'values': 'Values'},
    title='400 den Fazla Aktivitesi Olan Hedef Adreslerin - Etkileşimli Balon Grafigi',
)
fig.show()


In [None]:
# Interactive bubble chart of (source -> destination) pairs with more than 150 events
# in a minute bucket.
# The earlier plt.figure(...) call is gone: the chart is drawn by plotly, so the matplotlib
# figure was never used and only produced an empty figure in the cell output.

# Single label per address pair, used for both the colour grouping and the tooltip heading.
df_reset_sourcedest['source_dest_combo'] = (
    df_reset_sourcedest['source_address'] + ' -->>>> ' + df_reset_sourcedest['destination_address']
)

fig = px.scatter(
    df_reset_sourcedest[df_reset_sourcedest['values']>150],
    x='minutes',
    y='values',
    size='values',
    color='source_dest_combo',
    hover_name='source_dest_combo',
    title='Kaynaktan-->> Hedefe Trafik Yoğunluğu - Etkileşimli Balon grafiği ',
    labels={'source_dest_combo': 'Source -> Destination'},
)

fig.show()

In [None]:
# Convert the three byte counters from bytes to gigabytes (1 GB taken as 1024**3 bytes).
# The previous code divided two of the columns by 1024*1024 (megabytes) and left
# bidirectional_bytes unconverted, although all three are renamed and reported as GB.
BYTES_PER_GB = 1024 ** 3

# Values are read from the unmodified raw frame `data` (df keeps its row index), so running
# this cell again does not divide an already-converted column a second time.
# No per-row rounding: most single flows are far smaller than 0.01 GB, and rounding them
# before the group sums below would zero them out. Round the aggregated results instead.
for byte_column in ('src2dst_bytes', 'dst2src_bytes', 'bidirectional_bytes'):
    df[byte_column] = data[byte_column] / BYTES_PER_GB

In [None]:

# Rename the three byte columns so their names carry the GB unit.
gb_column_names = {
    'bidirectional_bytes': 'bidirectional_GBytes',
    'src2dst_bytes': 'src2dst_GBytes',
    'dst2src_bytes': 'dst2src_GBytes',
}
df.rename(columns=gb_column_names, inplace=True)

In [None]:
# Transfer-volume view used by the data-exfiltration and ARP-scan cells below,
# ordered by source->destination volume (largest first).
# 'bidirectional_GBytes' is part of the selection because the total-transfer cell
# aggregates that column from df_transfer.
TRANSFER_COLUMNS = [
    'source_address',
    'destination_address',
    'src2dst_GBytes',
    'dst2src_GBytes',
    'bidirectional_GBytes',
    'minutes',
]
df_transfer = df[TRANSFER_COLUMNS].sort_values(by='src2dst_GBytes', ascending=False)


In [None]:
# Ten largest source->destination volumes per (address, minute):
# once keyed by the sending source address, once by the receiving destination address.
top_uploads_by_source = (
    df_transfer.groupby(['source_address', 'minutes'])['src2dst_GBytes']
    .sum()
    .sort_values(ascending=False)[:10]
    .reset_index(name='Upload GB')
)
top_downloads_by_destination = (
    df_transfer.groupby(['destination_address', 'minutes'])['src2dst_GBytes']
    .sum()
    .sort_values(ascending=False)[:10]
    .reset_index(name='Download GB')
)

# A cell only renders its last expression, so the first table used to be computed and
# dropped; display() (provided by the notebook kernel) renders both.
display(top_uploads_by_source, top_downloads_by_destination)

In [None]:
# Ten largest source->destination volumes per (source, destination, minute) triple.
(
    df_transfer
    .groupby(['source_address', 'destination_address', 'minutes'])['src2dst_GBytes']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .reset_index(name='Download GB')
)

In [None]:
# Twenty largest destination->source volumes per (source, destination, minute) triple.
(
    df_transfer
    .groupby(['source_address', 'destination_address', 'minutes'])['dst2src_GBytes']
    .sum()
    .sort_values(ascending=False)
    .head(20)
    .reset_index(name='Upload GB')
)

In [None]:
# Twenty largest combined (both directions) volumes per (source, destination, minute) triple.
# Aggregated from df rather than df_transfer: df carries the 'bidirectional_GBytes' column
# after the rename above, and the row order of the input does not affect the group sums.
(
    df.groupby(['source_address', 'destination_address', 'minutes'])['bidirectional_GBytes']
    .sum()
    .sort_values(ascending=False)[:20]
    .reset_index(name='Total Transfer GB')
)