In [None]:
import json
import os
from os import listdir
from os.path import isfile, join
import gzip
import re
import pandas as pd
from collections import Counter

### 2021

In [None]:
# Directory holding the 2021 case archives scanned by the cells below
# (all cases per year; this data is available upon request).
dir_path = "/data_out/cases_gzip/2021"

In [None]:
# Full paths of the regular files in `dir_path` whose names end with ".json".
# `listdir` returns entries in arbitrary order, so the result is sorted to keep
# positions in `compressed_cases` stable from one run to the next.
compressed_cases = sorted(
    join(dir_path, f) for f in listdir(dir_path)
    if isfile(join(dir_path, f)) and f.endswith(".json")
)

In [None]:
# Sanity check: number of archive files picked up from `dir_path`
len(compressed_cases)

In [None]:
# Regular-expression patterns used for the keyword search.
# Raw strings keep regex escapes such as \w, \s, \b and \+ intact; in ordinary
# string literals "\w", "\s" and "\+" are invalid escape sequences that newer
# Python versions warn about. The compiled patterns are unchanged.
keywords = [r"((сексуальн)|(нетрадицион))\w{2,5}\s(ориентац)",
            r"(гомосек)\w*",
            r"(лесби)\w*",
            r"(бисекс)\w*",
            r"(трансвест)\w*",
            r"(транссекс)\w*",
            r"(трансгендер)\w*",
            r"(\bтравести)\w*",
            r"(\bтрансух)\w*",
            r"(транс)о?(фоб)\w*",
            r"(ЛГБТ)\+?(\w*)",
            r"(мужелож)\w*",
            r"(гомофоб)\w*",
            r"(\bсодоми)\w*"]

In [None]:
def search_keywords(keywords:list, doc:str) -> tuple:
    '''
    Look for each regex pattern of `keywords` in `doc` (case-insensitive).

    Only the first occurrence of every pattern is kept, in the order of `keywords`.
    Returns a tuple: (number of patterns that matched, list of the matched substrings).
    '''
    first_hits = (re.search(pattern, doc, flags=re.IGNORECASE) for pattern in keywords)
    found_texts = [hit[0] for hit in first_hits if hit is not None]
    return (len(found_texts), found_texts)

In [None]:
def find_cases_by_keywords(keywords:list, path_to_gzip:str, path_to_save="", year="") -> str:
    '''
    Scan one gzip-compressed JSON archive for keyword matches and save every matching case.

    The archive must decode to a dict whose values are dicts with an optional
    "cases" list; each case is a dict whose "case_text" (if present) is searched
    with `search_keywords`. A matching case gets two extra fields, "website"
    (the key of the outer dict it was found under) and "keyword_mathces"
    (the matched substrings), and is written as its own JSON file named
    `{year}_{region_code}_{case_id}.json` inside `path_to_save`.

    keywords: regex patterns handed to `search_keywords`
    path_to_gzip: path to the archive; the part of its file name before the
        first "_" is used as the region code
    path_to_save: output directory, created if missing ("" means the current directory)
    year: prefix of the output file names

    Returns a summary string with the number of matching cases in the archive.
    '''

    with gzip.open(path_to_gzip, "rt", encoding="utf-8") as jf:
        data_by_region = json.load(jf)

    region_code = os.path.basename(path_to_gzip).split('_')[0]

    if path_to_save:
        os.makedirs(path_to_save, exist_ok=True)

    n_cases_found = 0

    for website, cases in data_by_region.items():
        for doc in cases.get("cases") or []:

            target_text = doc.get("case_text")
            if target_text is None:
                target_text = ''

            matches = search_keywords(keywords, target_text)
            if matches[0] == 0:
                continue

            n_cases_found += 1
            # add a website
            # add matches
            doc["website"] = website
            doc["keyword_mathces"] = matches[1]

            # setting a case id
            case_id = doc.get("case_id_uid")
            if case_id is None:
                # N_region_code, where N is the running number of the match within
                # the whole archive: a per-website counter would repeat the same ids
                # for every website and overwrite previously saved files
                case_id = f"{n_cases_found}_{region_code}"

            # save matching cases (as separate json files)
            # NOTE(review): a case id containing a path separator would break the
            # file name — confirm ids never contain "/"
            out_path = join(path_to_save, f'{year}_{region_code}_{case_id}.json')
            # explicit UTF-8: ensure_ascii=False writes Cyrillic text verbatim, which
            # fails under a non-UTF-8 locale default encoding
            with open(out_path, 'w', encoding='utf-8') as f:
                json.dump(doc, f, ensure_ascii=False)

    return f"{n_cases_found} cases are found in region {region_code}"

In [None]:
# Output directory and year label passed to `find_cases_by_keywords` in the next cell
path_to_save = "sudrf_keyword_search"
year = "2021"

In [None]:
# Run the keyword search over the region archives.
# Start from the first archive so that a fresh "Restart & Run All" covers every
# region; raise `start_index` only to resume an interrupted run.
start_index = 0
for compressed_case in compressed_cases[start_index:]:
    region_code = os.path.basename(compressed_case).split('_')[0]
    find_cases_by_keywords(keywords,compressed_case,path_to_save,year=year)
    print(f"region {region_code} is completed")

In [None]:
### The same procedure was applied for 2022 cases

### Keyword search in Moscow cases

In [None]:
def find_moscow_cases_by_keywords(keywords:list, path_to_cases:str, path_to_save="", year="") -> str:
    '''
    Search every ".txt" file of a directory for keywords and save each matching case as JSON.

    For each matching file a dict with "case_text", "website" and
    "keyword_mathces" is written to `{year}_77_{case_id}.json` inside
    `path_to_save`, where `case_id` is the file name without its ".txt" extension.

    keywords: regex patterns handed to `search_keywords`
    path_to_cases: directory scanned (non-recursively) for ".txt" files
    path_to_save: output directory, created if missing ("" means the current directory)
    year: prefix of the output file names, also echoed in the returned summary

    Returns a summary string with the number of matching files.
    Raises IndexError when a matching file name contains no "_" (the "website"
    value is taken from the second-to-last "_"-separated part of the name).
    '''

    all_txt_files = [join(path_to_cases, f) for f in listdir(path_to_cases)
                     if isfile(join(path_to_cases, f))
                     and f.endswith(".txt")]

    if path_to_save:
        os.makedirs(path_to_save, exist_ok=True)

    n_cases_found = 0

    for f in all_txt_files:

        # NOTE(review): the case files are assumed to be UTF-8 encoded — confirm
        with open(f, 'r', encoding='utf-8') as txt_file:
            case_text = txt_file.read()

        matches = search_keywords(keywords, case_text)
        if matches[0] == 0:
            continue

        n_cases_found += 1
        file_name = os.path.basename(f)

        # shape the resulting file
        result = {
            "case_text": case_text,
            "website": file_name.split('_')[-2],
            "keyword_mathces": matches[1],
        }

        # setting a case id: drop the extension as a suffix.
        # str.rstrip('.txt') would strip any trailing '.', 't' or 'x' characters
        # and mangle names such as "abc_text.txt"
        case_id = os.path.splitext(file_name)[0]

        # save matching cases (as separate json files)
        # region code 77 is const for Moscow
        out_path = join(path_to_save, f'{year}_77_{case_id}.json')
        # explicit UTF-8 because ensure_ascii=False writes Cyrillic text verbatim
        with open(out_path, 'w', encoding='utf-8') as jf:
            json.dump(result, jf, ensure_ascii=False)

    return f"{n_cases_found} cases are found ({year})"

In [None]:
# 2021
# Arguments for the Moscow run: output directory, year label and the directory
# that `find_moscow_cases_by_keywords` scans for ".txt" files
path_to_save = 'sudrf_keyword_search'
year = '2021'
path_to_cases = 'msk/2021_txt'

In [None]:
# The returned summary string is shown as the cell output
find_moscow_cases_by_keywords(keywords, path_to_cases, path_to_save, year)

### Creating a df from jsons

In [None]:
# From here on `path_to_cases` points at the directory with the saved keyword
# matches (the same name held the Moscow input directory above)
path_to_cases = 'sudrf_keyword_search'

In [None]:
# Full paths of every ".json" file stored directly in `path_to_cases`
candidate_paths = (join(path_to_cases, name) for name in listdir(path_to_cases))
all_cases_with_keywords = [path for path in candidate_paths
                           if path.endswith(".json") and isfile(path)]

In [None]:
# Result table for the keyword matches: case id, context windows,
# article list and matched keywords
kw_cases_df = pd.DataFrame(columns=['case_id','context','article_list','keywords'])

In [None]:
# Build one row per saved case: case id, a context window around each matched
# keyword, the comma-separated article list and the matched keywords.
# Rows are collected first and the frame is created once, so re-running the
# cell rebuilds the table instead of appending duplicate rows.
context_width = 200  # characters kept on each side of a matched keyword
records = []

for case in all_cases_with_keywords:

    with open(case, 'r', encoding='utf-8') as jf:
        doc = json.load(jf)

    # fields
    # default id: the file name without its extension; it is replaced by
    # "case_id_uid" below when the case carries metadata and a uid
    case_id = os.path.splitext(os.path.basename(case))[0]
    art_str = ''

    metadata = doc.get('metadata')
    if metadata is not None:
        all_art = []
        # assumes every metadata dict has an "accused" list whose items carry an
        # "article" iterable — TODO confirm against the scraped schema
        for a in metadata['accused']:
            all_art.extend(a['article'])
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # article string is the same on every run
        art_str = ','.join(f'{art}' for art in dict.fromkeys(all_art))

        if doc.get('case_id_uid') is not None:
            case_id = doc['case_id_uid']

    # get context window
    case_text = doc['case_text']
    contexts = []
    for kw in doc['keyword_mathces']:
        # the keyword is looked up literally and the window is clipped at the
        # text boundaries, so matches near the start or end of the text still
        # produce a context
        start = case_text.find(kw)
        if start == -1:
            # keep `contexts` aligned with the keyword list
            contexts.append('')
            continue
        end = start + len(kw)
        contexts.append(case_text[max(0, start - context_width):end + context_width])

    records.append({
        'case_id': case_id,
        'context': contexts,
        'article_list': art_str.rstrip(','),
        'keywords': doc['keyword_mathces'],
    })

kw_cases_df = pd.DataFrame(records, columns=['case_id','context','article_list','keywords'])

In [None]:
# Collect article references ("ст" + one character + optional whitespace +
# a 2-3 digit number) from the article list of every row.
# NOTE(review): `list_of_all_art` is taken here from the `article_list` column of
# `kw_cases_df`, so the cell runs on a fresh kernel — confirm this is the intended source.
list_of_all_art = list(kw_cases_df['article_list'])

all_articles = []
for a in list_of_all_art:
    # the "." is left unescaped on purpose to keep the existing matches
    # (it accepts any character after "ст", not only a dot)
    matches = re.findall(r'(ст.\s?\d{2,3})',a)
    all_articles.extend(matches)

In [None]:
# Frequency of each extracted article reference
counted_art = Counter(all_articles)

In [None]:
# All article references, most frequent first
counted_art.most_common()

In [None]:
# Per-case keyword lists, one entry per row of `kw_cases_df`
list_of_all_kws = kw_cases_df['keywords'].tolist()

In [None]:
# Display the per-case keyword lists (one list per row of `kw_cases_df`)
list_of_all_kws

In [None]:
# Flatten the per-case keyword lists into a single list of matched keywords
all_kw = [single_kw for case_kws in list_of_all_kws for single_kw in case_kws]

In [None]:
# Matched keyword forms across all cases, most frequent first
Counter(all_kw).most_common()

In [None]:
# Persist the table; with pandas defaults the row index is written as the first column
kw_cases_df.to_csv('keyword_search_context_df.csv')

In [None]:
### function to view the case text

In [None]:
def print_text(case_id:str, dir_path=None) -> str:
    '''
    Return the text of a saved case whose file name contains `case_id`.

    case_id: substring looked up in the names of the ".json" files in `dir_path`
    dir_path: directory with the saved cases; when omitted, the absolute path this
        function was originally hard-wired to is used, so pass a directory
        explicitly when the notebook runs on another machine

    Returns the "case_text" field of the first matching file (in sorted file-name order).
    Raises FileNotFoundError when no ".json" file name contains `case_id`.
    '''

    if dir_path is None:
        dir_path = "/Users/Macintosh/Library/Mobile Documents/com~apple~CloudDocs/data_out/sudrf_keyword_search"

    for file_name in sorted(listdir(dir_path)):
        full_path = join(dir_path, file_name)
        if not (file_name.endswith(".json") and isfile(full_path)):
            continue
        # compare against the file name only, so parts of `dir_path` never count as a hit
        if case_id in file_name:
            with open(full_path, 'r', encoding='utf-8') as jf:
                return json.load(jf)["case_text"]

    raise FileNotFoundError(f"no saved case matching {case_id!r} in {dir_path}")