# Inserindo dados no MongoDB

In [1]:
from pymongo import MongoClient #Conexão com MongoDB
import pprint #Mostrar informações dos dados do banco de dados
import pandas as pd #Leitura dos arquivos
from pymongo import timeout
import numpy as np

## Criando o cliente do MongoDB

In [2]:
client = MongoClient('localhost', 27017)

In [3]:
client

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)

In [4]:
#Nome do banco de dados
nome_bd = 'Ecommerce_itens' 

In [5]:
db_itens = client[nome_bd]

In [6]:
db_itens

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'Ecommerce_itens')

## Lendo os arquivos .csv

In [7]:
#Pega o nome dos arquivos e remove o .csv deles
import os

mylist = os.listdir("datasets")
dic_files = {}
for file in mylist:
    aux = file.split(".")[0]
    dic_files[aux] = pd.read_csv(f"datasets/{file}")

In [29]:
# Total de datasets
len(dic_files)

139

## Removendo arquivos vazios

In [8]:
# Verificando se todos os datasets tem mais de 0 instâncias

final_dic = dic_files.copy()
for i in dic_files:

    if dic_files[i].shape[0] == 0:
        del final_dic[i]

In [28]:
# Total de datasets
len(final_dic)

113

## Visualizando informações dos conjuntos

In [9]:
# Cabeçalho de um conjunto de dados
final_dic['Running'].head()

Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price
0,Amazon Brand - Symactive Men's Regular Fit T-S...,sports & fitness,Running,https://m.media-amazon.com/images/I/7148IlzWBV...,https://www.amazon.in/Amazon-Brand-Symbol-T-Sh...,4.0,2908,₹309,₹999
1,Jockey Mens Slub Casual Track Pant,sports & fitness,Running,https://m.media-amazon.com/images/I/61S5skYN95...,https://www.amazon.in/Jockey-Cotton-Track-9510...,4.1,5550,"₹1,186","₹1,399"
2,ASIAN Men's Skypee-162 Casual Sneaker Lightwei...,sports & fitness,Running,https://m.media-amazon.com/images/I/61O41FwCQF...,https://www.amazon.in/ASIAN-Skypy-162-Black-Ca...,3.8,4379,₹459,₹699
3,Campus Mens Jasper Running Shoes,sports & fitness,Running,https://m.media-amazon.com/images/I/610+hWjuh1...,https://www.amazon.in/Campus-Jasper-Running-Sh...,3.8,1985,₹897,"₹1,499"
4,"YS, Women Cotton Padded Wire Free Sports Bra F...",sports & fitness,Running,https://m.media-amazon.com/images/I/41QBkoB3vA...,https://www.amazon.in/YS-Cotton-Padded-Fitness...,5.0,10,₹499,₹899


In [89]:
# Colunas dos nossos conjuntos de dados
final_dic['Running'].columns

Index(['name', 'main_category', 'sub_category', 'image', 'link', 'ratings',
       'no_of_ratings', 'discount_price', 'actual_price'],
      dtype='object')

In [90]:
# Tipos dos nosso valores
final_dic['Running'].dtypes

name              object
main_category     object
sub_category      object
image             object
link              object
ratings           object
no_of_ratings     object
discount_price    object
actual_price      object
dtype: object

## Tratando valores NULOS

In [91]:
# Vamos verificar quais os atributos contém valores nulos nos datasets
for i in list(final_dic.keys()):
    print(list((final_dic[i].isna().sum() > 0).where((final_dic[i].isna().sum()>0)).dropna().keys()))

['ratings', 'no_of_ratings', 'discount_price', 'actual_price']
['ratings', 'no_of_ratings', 'discount_price', 'actual_price']
['ratings', 'no_of_ratings', 'discount_price', 'actual_price']
['ratings', 'no_of_ratings', 'discount_price', 'actual_price']
['ratings', 'no_of_ratings', 'discount_price', 'actual_price']
['ratings', 'no_of_ratings', 'discount_price', 'actual_price']
['ratings', 'no_of_ratings', 'discount_price', 'actual_price']
['ratings', 'no_of_ratings', 'discount_price', 'actual_price']
['ratings', 'no_of_ratings', 'discount_price', 'actual_price']
['ratings', 'no_of_ratings', 'discount_price', 'actual_price']
['ratings', 'no_of_ratings', 'discount_price', 'actual_price']
['ratings', 'no_of_ratings', 'discount_price', 'actual_price']
['ratings', 'no_of_ratings', 'discount_price', 'actual_price']
['ratings', 'no_of_ratings', 'discount_price', 'actual_price']
['ratings', 'no_of_ratings', 'discount_price', 'actual_price']
['ratings', 'no_of_ratings', 'discount_price', 'actual_

Podemos perceber que os seguintes atributos, tem pelo menos 1 instância NULA: ratings, no_of_ratings, discount_price, actual_price.


Logo a ideia para o pre-processamento é a seguinte: Primeiro, dropar as instâncias que tiveram o atributo "actual_price" nulo, depois se ainda tivermos valores NULOS iremos: transformar no_of_ratings e ratings em valores numericos, tratando casos que possam conter valores não numericos neste atributo, já que ele está como object, e depois inserir aos valores NULOS a média, e com isso teremos o tratamento de valores nulos. E finalmente, para os valores NULOS do atributo discount_price iremos adicionar o valor "0" apenas para não deixar nulos, e dessa forma evitar a inserção de valores NULOS no mongoDB

In [100]:
# BRILHA GEOVANNE

# Tratando o preço

Nessa etapa, iremos tratar o atributo do preço que está como objeto devido a ter o valor de libras à frente.

In [10]:
# Cria as coleções para inserir no MongoDB
collections = {}

for keys in final_dic.keys():
    collections[keys] = db_itens.get_collection(keys)

In [109]:
def insertInColection_db(col_name,df,colection):
    main_fdata = []
    
    columns = df.columns
    ['name', 'main_category', 'sub_category', 'image', 'link', 'ratings',
       'no_of_ratings', 'discount_price', 'actual_price']
    rows,cols = df.shape
    
    for index, row in df.iterrows():
            fdata = {}
            for i in range(cols):
                    fdata[columns[i]] = row[columns[i]]
                    if columns[i] == 'no_of_ratings':
                        try:
                            fdata["rating_levance"] = float(row['ratings']) * int(row['no_of_ratings'])
                        except:
                            fdata["rating_levance"] = 0
            main_fdata.append(fdata)

    with timeout(100):
        colection.insert_many(main_fdata)


    print("...")
    return

In [110]:
insertInColection_db("Running",final_dic["Running"],collections["Running"])

...


In [25]:
for post in collections["Running"].find({}):
    pprint.pprint(post)

{'_id': ObjectId('653c47fa14d3a9ef7ecbe05d'),
 'actual_price': '₹999',
 'discount_price': '₹309',
 'image': 'https://m.media-amazon.com/images/I/7148IlzWBVL._AC_UL320_.jpg',
 'link': 'https://www.amazon.in/Amazon-Brand-Symbol-T-Shirt-AW17-SYSP-03B_Medium_Viridian/dp/B071GTQH6K/ref=sr_1_313?qid=1679217352&s=sports&sr=1-313',
 'main_category': 'sports & fitness',
 'name': "Amazon Brand - Symactive Men's Regular Fit T-Shirt",
 'no_of_ratings': '2,908',
 'ratings': '4.0',
 'sub_category': 'Running'}
{'_id': ObjectId('653c4f8214d3a9ef7ecbe05e'),
 'actual_price': '₹999',
 'discount_price': '₹309',
 'image': 'https://m.media-amazon.com/images/I/7148IlzWBVL._AC_UL320_.jpg',
 'link': 'https://www.amazon.in/Amazon-Brand-Symbol-T-Shirt-AW17-SYSP-03B_Medium_Viridian/dp/B071GTQH6K/ref=sr_1_313?qid=1679217352&s=sports&sr=1-313',
 'main_category': 'sports & fitness',
 'name': "Amazon Brand - Symactive Men's Regular Fit T-Shirt",
 'no_of_ratings': '2,908',
 'rating_levance': 0,
 'ratings': '4.0',
 'su