In [None]:
import os
import re

import pandas as pd
import numpy as np
import get_tag_data as gt
import matplotlib.pyplot as plt

#### Загружаем отобранные теги

In [None]:
# Directory that holds the per-tag post dumps.
path = './data/tags/posts_mk/'

# Read the tag frequency table, discard its first two rows (by index label)
# and renumber the remaining rows from zero.
tags = pd.read_csv("./data/tags/mk_tags_freq.csv", index_col=0)
tags = tags.drop(tags.index[:2]).reset_index(drop=True)

In [None]:
# Preview the first rows of the tag table.
tags.head()

In [None]:
# (rows, columns) of the tag table.
tags.shape

# TODO: Перенести функции в отдельный модуль

Собираем данные в csv

In [None]:
def harvest_tags(tags, path):
    """Dump the posts found for every tag in ``tags`` into ``path``.

    Calls ``gt.get_tag_data`` once per row with ``dump=True`` and a file
    prefix of the form ``"<tag>_<row label>"``.

    Parameters
    ----------
    tags : pandas.DataFrame
        Table with a ``"tag"`` column; the index label of each row becomes
        part of the dump prefix.
    path : str
        Passed through to ``gt.get_tag_data`` as the dump location.
    """

    # Only the "tag" column is needed, so iterate over it directly instead of
    # using ``iterrows`` (which builds a Series per row and may upcast values).
    for index, tag in tags["tag"].items():
        prefix = f"{tag}_{index}"
        gt.get_tag_data(tag, dump=True, path=path, prefix=prefix)

Сливаем файлы в одну таблицу

In [None]:
def merge_from_path(path):
    """Merge every table stored directly in ``path`` into one DataFrame.

    Each file is read as a ``;``-separated, UTF-8 encoded CSV whose first
    column is the index. A ``by_tag`` column is added holding the part of the
    file name before the first underscore. Rows are then de-duplicated on
    ``post_id``, keeping the first occurrence.

    Parameters
    ----------
    path : str
        Directory containing the dumped tables; sub-directories are ignored.

    Returns
    -------
    pandas.DataFrame
        The concatenated, de-duplicated table, or an empty DataFrame when the
        directory holds no files.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    """

    # Sort the names: the directory listing order is arbitrary, and it decides
    # which copy of a duplicated ``post_id`` (and thus which ``by_tag``) survives.
    file_names = sorted(
        name for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name))
    )

    tables = []
    for file_name in file_names:
        next_table = pd.read_csv(os.path.join(path, file_name), sep=";", index_col=0, engine="python", encoding="utf-8")
        next_table["by_tag"] = file_name.split("_")[0]
        print(f"Next table shape: {next_table.shape}")
        tables.append(next_table)

    if not tables:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    table = pd.concat(tables, sort=False)
    # ``drop=True`` discards the per-file index regardless of its name.
    data = table.reset_index(drop=True).drop_duplicates(subset="post_id")
    return data

Pipeline

In [None]:
def add_contacts(df):
    """Adds phone numbers and bollean direct features"""
    
    df = df.copy()
    contacts_pat = re.compile("([\+7|7|8]?[\s\-]?\(?[489][0-9]{2}\)?[\s\-]?[0-9]{3}[\s\-]?[0-9]{2}[\s\-]?[0-9]{2})|(директ)")
    df[["phone_number", "direct"]] = df["text"].str.extract(contacts_pat)
    return df

    
def add_price(df):
    """Add an integer ``price`` column parsed from the post text.

    The price is the first run of digits ending in ``0`` that is followed by
    an optional whitespace character and the letter "р" in the lower-cased
    text: "420 р" and "4000р" are prices, "421" is not. Rows without such a
    match, and matches longer than five digits, get a price of 0.

    The input frame is not modified; a copy with the column is returned.
    """

    max_digits = 5  # longer matches (> 99999) are treated as "no price"

    df = df.copy()
    # Raw string: ``\d`` and ``\s`` are regex escapes, not string escapes.
    price_pat = re.compile(r"([\d]+[0]{1})\s?[р]{1}")
    matched = df["text"].str.lower().str.extract(price_pat, expand=False)
    df["price"] = (matched
                   # NaN (no match) and over-long matches both become 0
                   .map(lambda raw: raw if isinstance(raw, str) and len(raw) <= max_digits else 0)
                   .astype("int64"))
    return df

    
def filter_workshops(df):
    """Keep only the rows that look like workshop announcements.

    A row is kept when it has at least one contact (a non-null
    ``phone_number`` or ``direct``) and a ``price`` within 500..10000
    inclusive.
    """

    has_phone = df["phone_number"].notnull()
    has_direct = df["direct"].notnull()
    price_ok = (df["price"] >= 500) & (df["price"] <= 10000)
    return df[(has_phone | has_direct) & price_ok]

In [None]:
%%time

# ``data`` is the merged table of all dumped posts; build it here so the cell
# also works on a fresh kernel (expects the dumps written by ``harvest_tags``
# to be present in ``path``).
data = merge_from_path(path)

# Extract contacts and price, keep only workshop-like posts, renumber rows.
main_data = (data.pipe(add_contacts)
                 .pipe(add_price)
                 .pipe(filter_workshops)
                 .reset_index(drop=True))

In [None]:
# Size of the filtered table as (rows, columns).
print(main_data.shape)

# Preview of the first rows.
main_data.head()

In [None]:
# Write the filtered table as a ";"-separated UTF-8 CSV without the row index.
dump_path = "./data/"
main_data.to_csv(
    os.path.join(dump_path, "main_data.csv"),
    index=False,
    encoding="utf-8",
    sep=";",
)

#### Случайная выборка из постов для ручной проверки

In [None]:
# Draw the posts for manual review from ``main_data``. A seeded generator and
# sampling without replacement give unique post ids that are the same on
# every run; the size is capped so a small table does not raise.
sample_rng = np.random.default_rng(42)
sample_size = min(100, len(main_data))
random_post_ids = sample_rng.choice(main_data["post_id"], size=sample_size, replace=False)
pd.Series(random_post_ids).to_csv("./data/main_random_posts.csv")