<a href="https://colab.research.google.com/github/componavt/sns4human/blob/main/src/vk/vk_group_all_posts_gzip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Fetches all posts from a VK group, identified by its domain name, and saves them to a gzip-compressed CSV file (`.csv.gz`).

Получение всех постов из группы ВК по заданному доменному имени и сохранение их в сжатый файл `.csv.gz`.

In [66]:
import gzip
import shutil

import pandas as pd
import requests
import csv  # for quoting
from config import TOKEN, VERSION

In [67]:
def save_compressed_csv(df, file_name):
    """Write the post table to a gzip-compressed, UTF-8 encoded CSV file.

    Only the columns ``id``, ``text``, ``date``, ``likes``, ``reposts`` and
    ``views`` are written, in that order and without the index. Fields are
    quoted only when they need it (``csv.QUOTE_MINIMAL``).

    Args:
        df: DataFrame containing the six columns listed above.
        file_name: Path of the gzip file to create (overwritten if present).
    """
    # newline='' disables newline translation in the text layer, so the line
    # terminator emitted by pandas is stored verbatim. Without it, a '\r\n'
    # terminator would be expanded to '\r\r\n' on Windows.
    with gzip.open(file_name, 'wt', encoding='utf-8', newline='') as f:
        df.to_csv(f, index=False, quoting=csv.QUOTE_MINIMAL, quotechar='"',
                  columns=['id', 'text', 'date', 'likes', 'reposts', 'views'],
                  escapechar='\\')

In [68]:
def get_all_posts(token, version, domain, page_size=100, timeout=30):
    """Download every wall post of a VK group, one page at a time.

    Calls the ``wall.get`` API method repeatedly, advancing ``offset`` by
    ``page_size`` until a page shorter than ``page_size`` is returned.

    Args:
        token: VK API access token.
        version: VK API version string, sent as the ``v`` parameter.
        domain: Domain name of the group whose wall is read.
        page_size: Posts requested per call, from 1 to 100. VK caps ``count``
            at 100, and a larger value would make the first page look like
            the last one.
        timeout: Seconds to wait for each HTTP response before ``requests``
            raises, so a stalled connection cannot hang the notebook forever.

    Returns:
        A list of post dicts as returned by the API. An empty list is
        returned if any page yields an API error or an unexpected payload;
        posts collected before the failure are discarded, so a partial wall
        is never returned.

    Raises:
        ValueError: If ``page_size`` is outside the range 1..100.
    """
    if not 1 <= page_size <= 100:
        raise ValueError("page_size must be between 1 and 100")

    offset = 0
    all_posts = []

    while True:
        response = requests.get(
            'https://api.vk.com/method/wall.get',
            params={
                'access_token': token,
                'v': version,
                'domain': domain,
                'offset': offset,
                'count': page_size
            },
            timeout=timeout
        )
        data = response.json()

        if 'response' in data:
            posts = data['response']['items']
            all_posts.extend(posts)
            # A short (or empty) page means the end of the wall was reached.
            if len(posts) < page_size:
                return all_posts
            offset += page_size
        elif 'error' in data:
            print(f"Error: {data['error']['error_msg']}")
            return []
        else:
            # Neither 'response' nor 'error': treat as a failed download.
            return []

In [69]:
def _counter_value(value):
    """Return the integer ``count`` of a counter dict, or 0 for anything else."""
    if isinstance(value, dict):
        return int(value.get('count', 0))
    return 0


def create_all_posts_csv(token, version, domain):
    """Fetch all posts of a VK group and save them as ``<domain>.csv.gz``.

    The saved table has the columns ``id``, ``text``, ``date``, ``likes``,
    ``reposts`` and ``views``. Counter fields are flattened to plain integers,
    dates are formatted as Moscow local time, and newlines inside the post
    text are replaced by spaces.

    Args:
        token: VK API access token.
        version: VK API version string.
        domain: Domain name of the group; also used as the output file stem.

    Returns:
        None. Prints a message and writes nothing if no posts were retrieved.
    """
    posts = get_all_posts(token, version, domain)
    if not posts:
        print("No posts retrieved.")
        return

    df = pd.DataFrame(posts)

    # Flatten the counter objects to integers. A counter missing from a single
    # post shows up as NaN, and one missing from every post produces no column
    # at all; both cases become 0.
    for column in ('likes', 'reposts', 'views'):
        if column in df.columns:
            df[column] = df[column].map(_counter_value)
        else:
            df[column] = 0

    df = df[['id', 'text', 'date', 'likes', 'reposts', 'views']].copy()

    # Unix timestamps (seconds, UTC) -> Moscow local time as a string
    df['date'] = (
        pd.to_datetime(df['date'], unit='s', utc=True)
        .dt.tz_convert('Europe/Moscow')
        .dt.strftime('%Y-%m-%d %H:%M:%S')
    )

    # Replace newline characters in the text with spaces
    df['text'] = df['text'].str.replace('\n', ' ', regex=False)

    # Save the compressed CSV file; fields are quoted only when necessary
    compressed_name = domain + ".csv.gz"
    save_compressed_csv(df, compressed_name)


In [70]:
# Domain name of the VK group whose posts are exported.
domain = 'pravmk'

In [71]:
# Fetch every post of the group and write them to "<domain>.csv.gz".
# TOKEN and VERSION come from the local config module.
create_all_posts_csv(TOKEN, VERSION, domain)