# Exploratory Data Analysis



In [40]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
df_art = pd.read_csv('articles_2017_08.csv')
df_com = pd.read_csv('comments_2017_08.csv')
# Make float better readable
pd.options.display.float_format = '{:.0f}'.format

import numpy as np

### First Look

In [44]:
df_art.head(3)

Unnamed: 0,tId,article_id,updated,num_comments,link,header,sub,text
0,30600370,30600370,1502015309,12,/leben/reisen/story/Fuenf-Gruende-fuer-eine-Re...,Fünf Gründe für eine Reise nach Alabama,Sweet Home Alabama! Der Staat im Südosten der ...,"Feinste Sandstrände, kilometerlange Maisfelder..."
1,11684042,11684042,1502015306,4,/ausland/news/story/Lopez-aus-Gefaengnis-in-Ha...,Lopez aus Gefängnis in Hausarrest entlassen,Vier Tage nach seiner Festnahme befindet sich ...,Vier Tage nach seiner Festnahme ist der venezo...
2,29297244,29297244,1502015324,39,/sport/weitere/story/29297244,Buhmann Gatlin holt Gold – Bolt wird gefeiert,Usain Bolt muss auf seine 12. WM-Goldmedaille ...,(sr/fal/sda)


In [12]:
df_com.head(3)

Unnamed: 0,tId,cId,mob,vup,vdo,tit,aut,time,con
0,30284249,329_485,1,5,2,@Rebby,Störenfried,am 06.08.2017 08:59,"Wäre doch machbar, Störsender in Autos einzub..."
1,25420673,1_37,1,0,0,@T. Paul,Papa Bär,am 05.08.2017 22:02,Hier. Aber ich bin ja nur ein Papa Bär. Hüte ...
2,30284249,47_47,1,39,7,Das liebe Handy...,Heinz,am 05.08.2017 19:00,Und dazu brauchts ne Studie? Legt im allgemei...


In [16]:
print(df_art.shape)
print(df_com.shape)

(21, 8)
(1307, 9)


In [18]:
print(df_art.dtypes)
print(' ')
print(df_com.dtypes)

tId               int64
article_id        int64
updated         float64
num_comments      int64
link             object
header           object
sub              object
text             object
dtype: object
 
tId      int64
cId     object
mob      int64
vup      int64
vdo      int64
tit     object
aut     object
time    object
con     object
dtype: object


In [45]:
df_art.describe()

# Summary:
# No missing attributes since count is the same everywhere
# No negative values, chi^2 etc. can be used

Unnamed: 0,tId,article_id,updated,num_comments
count,21,21,21,21
mean,21993259,21993259,1502015317,63
std,6324191,6324191,8,95
min,11684042,11684042,1502015306,0
25%,15904700,15904700,1502015309,6
50%,22101669,22101669,1502015316,16
75%,26624359,26624359,1502015325,39
max,30715052,30715052,1502015328,296


In [42]:
df_com.describe()

# Summary:
# No missing attributes 
# No negative values

Unnamed: 0,tId,mob,vup,vdo
count,1307,1307,1307,1307
mean,19864521,1,40,19
std,6589672,0,69,36
min,11684042,0,0,0
25%,13942829,0,8,3
50%,19423348,1,22,8
75%,25420673,1,46,18
max,30715052,1,969,316


In [23]:
print(df_art.skew())
# Makes only sense on "num_comments"

tId            -0.128714
article_id     -0.128714
updated         0.063331
num_comments    1.611588
dtype: float64

In [25]:
print(df_com.skew())
# Skew is not high enough that it has to be corrected

tId    0.440598
cId    1.001689
mob   -1.122200
vup    7.306863
vdo    3.847134
dtype: float64

### Class Distribution

In [46]:
# "df_art" has no classes
# "df_com" only has "mob" class
print(df_com.groupby('mob').size())

mob
0    334
1    973
dtype: int64


### Correlation
Let's find relations between some of the attributes. Correlation requires continuous data.

In [48]:
df_cor = df_com[['mob', 'vup', 'vdo']]
