In [1]:
!pip install pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import pandas as pd
import os
from glob import glob

# Obtendo dados de treinamento sobre as noticias

In [3]:
# 1. Reading training data files (already read in previous cells)
# We can see that the training data has already been read and combined in df_combined

# 2. Reading news/items files
news_files = glob(os.path.join('../datasources/train/news', 'itens-*.csv'))
print(f"Found {len(news_files)} news files: {news_files}")

# Reading and combining all news files
dfs_news = []
for file in news_files:
  df_news_part = pd.read_csv(file)
  dfs_news.append(df_news_part)
  print(f"Read {file}, shape: {df_news_part.shape}")

df_news = pd.concat(dfs_news, ignore_index=True)
print(f"\nCombined news dataframe shape: {df_news.shape}")
print(df_news.head())
# Summary of dataframes
print("\nSummary of dataframes:")
print(f"News data (df_news): {df_news.shape} rows, {df_news.columns.tolist() if 'df_news' in locals() else 'not loaded'}")

Found 3 news files: ['../datasources/train/news/itens-parte2.csv', '../datasources/train/news/itens-parte3.csv', '../datasources/train/news/itens-parte1.csv']
Read ../datasources/train/news/itens-parte2.csv, shape: (100000, 7)
Read ../datasources/train/news/itens-parte3.csv, shape: (55603, 7)
Read ../datasources/train/news/itens-parte1.csv, shape: (100000, 7)

Combined news dataframe shape: (255603, 7)
                                   page  \
0  7371a9b5-5824-4c57-8704-00a74feebe79   
1  7a5ea08f-4583-49e2-ba52-a71999443f7b   
2  6afc8bbb-4f36-43d5-8a44-a2917df5621a   
3  5cc3bd27-80c7-457d-a807-2e8e7fddf031   
4  d6956177-db96-42f5-9f68-dd0d6e930661   

                                                 url  \
0  http://g1.globo.com/al/alagoas/noticia/2018/09...   
1  http://g1.globo.com/am/amazonas/noticia/detent...   
2  http://g1.globo.com/ap/amapa/noticia/audios-mo...   
3  http://g1.globo.com/ap/amapa/noticia/2020/11/0...   
4  http://g1.globo.com/ap/amapa/noticia/2019/05/2...   

In [None]:
# News data cleaning and preparation
print("Performing news data cleaning and preparation...")

# Working with a copy of the dataframe
df_news_clean = df_news.copy()

# 1. Check for null values
print(f"\nNumber of null values per column:")
print(df_news_clean.isnull().sum())

# 2. Remove duplicate rows
dup_before = df_news_clean.duplicated().sum()
df_news_clean = df_news_clean.drop_duplicates()
print(f"\nDuplicate rows removed: {dup_before}")

# 3. Extract domain from URL
df_news_clean['domain'] = df_news_clean['url'].apply(lambda x: x.split('//')[1].split('/')[0] if isinstance(x, str) else "unknown")

# 4. Convert timestamp columns to datetime
df_news_clean['issued'] = pd.to_datetime(df_news_clean['issued'])
df_news_clean['modified'] = pd.to_datetime(df_news_clean['modified'])

# 5. Calculate text lengths (features)
df_news_clean['title_length'] = df_news_clean['title'].apply(lambda x: len(x) if isinstance(x, str) else 0)
df_news_clean['body_length'] = df_news_clean['body'].apply(lambda x: len(x) if isinstance(x, str) else 0)

# 6. Extract date information (year, month) from publication date
df_news_clean['issued_year'] = df_news_clean['issued'].dt.year.astype('int32')
df_news_clean['issued_month'] = df_news_clean['issued'].dt.month.astype('int32')

# 7. Calculate time difference between publication and modification (in hours)
df_news_clean['time_diff'] = (df_news_clean['modified'] - df_news_clean['issued']).dt.total_seconds() / 3600

# 8. Text body processing (remove extra line breaks)
df_news_clean['body'] = df_news_clean['body'].str.replace('\n', ' ', regex=False)

# 9. Check and treat outliers in numerical columns
# Here we use IQR to identify outliers in the time_diff column
Q1_time = df_news_clean['time_diff'].quantile(0.25)
Q3_time = df_news_clean['time_diff'].quantile(0.75)
IQR_time = Q3_time - Q1_time
lower_bound_time = Q1_time - 1.5 * IQR_time
upper_bound_time = Q3_time + 1.5 * IQR_time

# Apply the upper limit to the time_diff column
df_news_clean['time_diff'] = df_news_clean['time_diff'].clip(lower=0, upper=upper_bound_time)

# Show information about the cleaned dataframe
print("\nDataframe information after cleaning:")
print(df_news_clean.info())

# Show summary statistics
print("\nStatistics after cleaning:")
print(df_news_clean.describe())

# Show the first rows of the cleaned dataframe
print("\nFirst rows of the cleaned dataframe:")
print(df_news_clean.head())

print(f"\nFinal dimensions of the cleaned dataframe: {df_news_clean.shape}")

print(f"News data (df_news_clean): {df_news_clean.shape} rows, {df_news_clean.columns.tolist() if 'df_news_clean' in locals() else 'not loaded'}")

Realizando limpeza e preparação dos dados das notícias...

Quantidade de valores nulos por coluna:
page        0
url         0
issued      0
modified    0
title       0
body        0
caption     0
dtype: int64

Linhas duplicadas removidas: 0

Informações do dataframe após limpeza:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255603 entries, 0 to 255602
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype              
---  ------        --------------   -----              
 0   page          255603 non-null  object             
 1   url           255603 non-null  object             
 2   issued        255603 non-null  datetime64[ns, UTC]
 3   modified      255603 non-null  datetime64[ns, UTC]
 4   title         255603 non-null  object             
 5   body          255603 non-null  object             
 6   caption       255603 non-null  object             
 7   domain        255603 non-null  object             
 8   title_length  255603 non-null  int64      

In [None]:
# Reading training data files
train_files = glob(os.path.join("../datasources/train/", "treino_*.csv"))
print(f"Found {len(train_files)} training files: {train_files}")

# Reading and combining all training files
dfs_train = []
for file in train_files:
    df_train_part = pd.read_csv(file)
    dfs_train.append(df_train_part)
    print(f"Read {file}, shape: {df_train_part.shape}")

# Combine all training dataframes
df_train = pd.concat(dfs_train, ignore_index=True)
print(f"\nCombined training dataframe shape: {df_train.shape}")
print(df_train.head())

# Check for duplicates
dup_train = df_train.duplicated().sum()
print(f"\nDuplicate rows in training data: {dup_train}")

# Basic information about the training dataframe
print("\nTraining data info:")
print(df_train.info())

# Summary statistics
print("\nTraining data summary statistics:")
print(df_train.describe())

# Check for null values
print("\nNull values in training data:")
print(df_train.isnull().sum())

print(f"Training data (df_train): {df_train.shape} rows, {df_train.columns.tolist()}")

Found 6 training files: ['../datasources/train/treino_parte3.csv', '../datasources/train/treino_parte2.csv', '../datasources/train/treino_parte1.csv', '../datasources/train/treino_parte5.csv', '../datasources/train/treino_parte4.csv', '../datasources/train/treino_parte6.csv']
Read ../datasources/train/treino_parte3.csv, shape: (100000, 10)
Read ../datasources/train/treino_parte2.csv, shape: (100000, 10)
Read ../datasources/train/treino_parte1.csv, shape: (100000, 10)
Read ../datasources/train/treino_parte5.csv, shape: (100000, 10)
Read ../datasources/train/treino_parte4.csv, shape: (100000, 10)
Read ../datasources/train/treino_parte6.csv, shape: (77942, 10)

Combined training dataframe shape: (577942, 10)
                                              userId    userType  historySize  \
0  fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...  Non-Logged            2   
1  17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...  Non-Logged            2   
2  528a8d7a2af73101da8d6709c1ec875b449a5a5

In [None]:
# df_news_clean.head()

Unnamed: 0,page,url,issued,modified,title,body,caption,domain,title_length,body_length,issued_year,issued_month,time_diff
0,7371a9b5-5824-4c57-8704-00a74feebe79,http://g1.globo.com/al/alagoas/noticia/2018/09...,2018-09-13 14:52:55+00:00,2018-09-14 16:14:49+00:00,Corpo de motorista da Uber é encontrado em can...,Corpo de motorista de aplicativo desaparecido ...,"Segundo a polícia, Antônio Vitor foi solicitad...",g1.globo.com,66,1417,2018,9,9.728056
1,7a5ea08f-4583-49e2-ba52-a71999443f7b,http://g1.globo.com/am/amazonas/noticia/detent...,2018-05-20 20:42:40+00:00,2018-05-20 20:42:40+00:00,Detento recapturado após fuga por túnel volta ...,Detento disse que passou nome falso ao dar ent...,Ele tinha registro em presídio com nome falso....,g1.globo.com,68,2104,2018,5,0.0
2,6afc8bbb-4f36-43d5-8a44-a2917df5621a,http://g1.globo.com/ap/amapa/noticia/audios-mo...,2017-07-30 00:37:17+00:00,2017-07-30 00:48:42+00:00,Áudios mostram conversa entre bandidos durante...,Áudios mostram possível conversa entre bandido...,Revista realizada na sexta-feira (28) no Iapen...,g1.globo.com,85,2328,2017,7,0.190278
3,5cc3bd27-80c7-457d-a807-2e8e7fddf031,http://g1.globo.com/ap/amapa/noticia/2020/11/0...,2020-11-06 12:54:00+00:00,2020-11-12 21:22:52+00:00,FOTOS: Apagão no Amapá,"Moradores da capital do Amapá, em Macapá, faze...",Incêndio em subestação de energia deixa 13 dos...,g1.globo.com,23,3258,2020,11,9.728056
4,d6956177-db96-42f5-9f68-dd0d6e930661,http://g1.globo.com/ap/amapa/noticia/2019/05/2...,2019-05-27 13:43:03+00:00,2019-05-27 18:19:06+00:00,Profissionais da educação no AP paralisam ativ...,Profissionais da educação paralisam atividades...,Ato comprometeu aulas em escolas nesta segunda...,g1.globo.com,85,3841,2019,5,4.600833


In [11]:
df_train.head()

Unnamed: 0,userId,userType,historySize,history,timestampHistory,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,timestampHistory_new
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,Non-Logged,2,"80aa7bb2-adce-4a55-9711-912c407927a1, d9e5f15d...","1657908085200, 1659634203762","0, 0","71998, 115232","81.58, 73.36","1, 1","1657908085200, 1659634203762"
1,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,Non-Logged,2,"19ba89fc-1e06-4c5d-9c57-4a3088dc0511, e273dba4...","1657111508570, 1657481309920","68, 12","131495, 43733","51.74, 35.49","1, 1","1657111508570, 1657481309920"
2,528a8d7a2af73101da8d6709c1ec875b449a5a58749a99...,Non-Logged,2,"59a61a8a-cc52-453f-b1cd-2bd019e9d574, a0562805...","1657823890328, 1660141444328","55, 9","159042, 10336","62.19, 48.28","1, 1","1657823890328, 1660141444328"
3,2dd18b58a634a4e77181a202cf152df6169dfb3e4230ef...,Non-Logged,2,"233f8238-2ce0-470f-a9d5-0e0ac530382a, 037155f4...","1656963373076, 1657091888917","0, 0","193579, 20519","31.03, 31.9","1, 1","1656963373076, 1657091888917"
4,97e1439d485b0630e12818d3df84ff67d08475ef6ebeb0...,Logged,2,"385044ad-3876-4188-83fa-f560435c1a9c, 2f754502...","1657618607633, 1659536839832","57, 38","220000, 130000","52.65, 53.37","1, 1","1657618607633, 1659536839832"


## Obtendo dados de validação

In [20]:
# 3. Reading validation data
validation_path = "../datasources/validacao.csv"
if os.path.exists(validation_path):
    df_validation = pd.read_csv(validation_path)
    print(f"\nValidation dataframe shape: {df_validation.shape}")
    print(df_validation.head())
else:
    print(f"\nValidation file not found at {validation_path}")

print(
    f"Validation data (df_validation): {df_validation.shape if 'df_validation' in locals() else 'not loaded'} rows"
)

print(
    f"News data (df_validation): {df_validation.shape} rows, {df_validation.columns.tolist() if 'df_validation' in locals() else 'not loaded'}"
)


Validation dataframe shape: (112184, 4)
                                              userId userType  \
0  e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...   Logged   
1  d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...   Logged   
2  755062dd39a48809880cf363b04268c3af2c003088cde0...   Logged   
3  ec1639851d99586c7f4da928deb49187303aec6e3b8d66...   Logged   
4  a120515626fe5d12b22b7d5a7c5008912cc69284aa26cc...   Logged   

                                             history  \
0  ['be89a7da-d9fa-49d4-9fdc-388c27a15bc8'\n '01c...   
1           ['77901133-aee7-4f7b-afc0-652231d76fe9']   
2           ['857aa90f-a7ec-410d-ba82-dfa4f85d4e71']   
3  ['b7b90e18-7613-4ca0-a8fc-fd69addfcd85'\n '835...   
4  ['9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6'\n 'b8e...   

                                    timestampHistory  
0                      [1660533136590 1660672113513]  
1                                    [1660556860253]  
2                                    [1660561649242]  
3  [1660533

In [None]:
# Data cleaning for validation dataset
print("Cleaning validation data...")

# Create a copy of the validation dataframe
df_validation_clean = df_validation.copy()

# 1. Check for null values
print("\nNull values in validation data:")
print(df_validation_clean.isnull().sum())

# 2. Check for duplicates
dup_before_val = df_validation_clean.duplicated().sum()
df_validation_clean = df_validation_clean.drop_duplicates()
print(f"Removed {dup_before_val} duplicate rows")

# 3. Clean the history column - it appears to be stored as string representations of lists
df_validation_clean["history"] = (
    df_validation_clean["history"]
    .str.replace("'", "")
    .str.replace("[", "")
    .str.replace("]", "")
    .str.replace("\n", "")
    .str.strip()
)
df_validation_clean["history"] = df_validation_clean["history"].str.split(",")

# 4. Clean the timestamp column - also appears to be stored as string representations of lists
df_validation_clean["timestampHistory"] = (
    df_validation_clean["timestampHistory"]
    .str.replace("[", "")
    .str.replace("]", "")
    .str.strip()
)
df_validation_clean["timestampHistory"] = df_validation_clean[
    "timestampHistory"
].str.split()

# 5. Calculate history size (number of items in history)
df_validation_clean["historySize"] = df_validation_clean["history"].apply(len)

# 6. Add user type distribution
user_type_counts = df_validation_clean["userType"].value_counts()
print("\nUser type distribution:")
print(user_type_counts)
print(
    f"Percentage of Logged users: {user_type_counts['Logged'] / len(df_validation_clean) * 100:.2f}%"
)


# 7. Convert timestamp strings to integers and calculate some time-based features
def process_timestamps(timestamps):
    if isinstance(timestamps, list):
        return [int(ts) for ts in timestamps if ts.isdigit()]
    return []


df_validation_clean["timestampHistory"] = df_validation_clean["timestampHistory"].apply(
    process_timestamps
)


# 8. Calculate time span between first and last interaction (in hours)
def calc_timespan(timestamps):
    if len(timestamps) > 1:
        return (timestamps[-1] - timestamps[0]) / (
            1000 * 60 * 60
        )  # milliseconds to hours
    return 0


df_validation_clean["timespan_hours"] = df_validation_clean["timestampHistory"].apply(
    calc_timespan
)

# Display the cleaned data
print("\nCleaned validation data sample:")
print(df_validation_clean.head())
print(f"\nCleaned validation data shape: {df_validation_clean.shape}")
print(df_validation_clean.info())