# @robotodio  -  training data

---

## Libraries

In [1]:
# Python libraries
# ------------------------------------------------------------------------------
# Reading files with different formats
import json

# Data wrangling
import pandas as pd
import numpy as np

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt

## Data loading

In [7]:
# Load the training data
# ------------------------------------------------------------------------------
# Path to the zipped CSV (relative path)
file_path = 'jubtc-train.csv.zip'

# Read only the listed columns from the compressed CSV.
# The path is passed positionally: the first parameter of pandas.read_csv is
# `filepath_or_buffer`, and a `file=` keyword is rejected with a TypeError.
df = pd.read_csv(
    file_path,
    header=0,
    sep=',',
    usecols=['id', 'comment_text', 'target', 'severe_toxicity', 'obscene',
             'threat', 'insult', 'identity_attack', 'sexual_explicit'],
    compression='zip')

# Preview the first rows
df.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,0.0
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,0.0
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,0.0
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,0.0
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0


In [10]:
# Preview the last rows of the loaded DataFrame
df.tail()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit
1804869,6333967,0.0,"Maybe the tax on ""things"" would be collected w...",0.0,0.0,0.0,0.0,0.0,0.0
1804870,6333969,0.0,What do you call people who STILL think the di...,0.0,0.0,0.0,0.0,0.0,0.0
1804871,6333982,0.0,"thank you ,,,right or wrong,,, i am following ...",0.0,0.0,0.0,0.0,0.0,0.0
1804872,6334009,0.621212,Anyone who is quoted as having the following e...,0.030303,0.030303,0.045455,0.621212,0.0,0.0
1804873,6334010,0.0,Students defined as EBD are legally just as di...,0.0,0.0,0.0,0.0,0.0,0.0


Data loaded successfully.

## EDA - Exploratory data analysis

**Dataset shape:**

In [17]:
# Report the dataset dimensions: df.shape[0] is the row count and
# df.shape[1] is the column count
print(f"The dataset has {df.shape[1]} columns, and {df.shape[0]} rows.")

The dataset has 9 columns, and 1804874 rows.


**Columns:**

We can see the name and data type of each column.

In [18]:
# Keep the DataFrame's column labels in `columns`
columns = df.columns

In [23]:
# Print each column's name and dtype, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1804874 entries, 0 to 1804873
Data columns (total 9 columns):
 #   Column           Dtype  
---  ------           -----  
 0   id               int64  
 1   target           float64
 2   comment_text     object 
 3   severe_toxicity  float64
 4   obscene          float64
 5   identity_attack  float64
 6   insult           float64
 7   threat           float64
 8   sexual_explicit  float64
dtypes: float64(7), int64(1), object(1)
memory usage: 123.9+ MB


In [25]:
# Summary statistics of the numeric columns, rounded to two decimals
df.describe().round(2)

Unnamed: 0,id,target,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit
count,1804874.0,1804874.0,1804874.0,1804874.0,1804874.0,1804874.0,1804874.0,1804874.0
mean,3738433.79,0.1,0.0,0.01,0.02,0.08,0.01,0.01
std,2445186.76,0.2,0.02,0.06,0.08,0.18,0.05,0.05
min,59848.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,796975.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5223774.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5769854.5,0.17,0.0,0.0,0.0,0.09,0.0,0.0
max,6334010.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [27]:
def assess_NA(data):
    """
    Summarise the missing values of a pandas DataFrame, column by column.

    The result is indexed by the column names of `data` and only lists the
    columns that contain at least one missing value.

    Parameters
    ----------
    data: dataframe
        Frame whose columns are inspected for NaN / None entries.

    Returns
    -------
    dataframe
        Two columns: 'Number of NA' (count of missing values) and
        'Percent NA' (that count as a percentage of the number of rows,
        rounded to two decimals), ordered from most to least missing.
    """
    # Missing-value count per column
    na_counts = data.isna().sum()

    # Same counts expressed as a percentage of the number of rows
    n_rows = len(data.index)
    na_share = (na_counts / n_rows * 100).round(2)

    # Put both measures side by side, each ordered from highest to lowest
    summary = pd.concat(
        [
            na_counts.sort_values(ascending=False),
            na_share.sort_values(ascending=False),
        ],
        axis=1,
        keys=['Number of NA', 'Percent NA'],
    )

    # Keep only the rows where at least one of the two measures is non-zero
    has_missing = (summary != 0).any(axis=1)
    return summary[has_missing]

# Build the missing-value summary for the loaded DataFrame and display it
df_NA = assess_NA(df)
df_NA

Unnamed: 0,Number of NA,Percent NA
