In [108]:
from text_analyzer.file_manager import FileManager
from collections import Counter
import json


class Analyzer:

    def __init__(self, data):
        self.data = data
        self.char_number = self.calculate_char_number()
        self.word_number = self.calculate_word_number()
        self.non_alpha_chars = self.count_non_alpha_chars()

    def calculate_char_number(self):
        """calculate number of chars for each doc. Return min, max, mean values as dict"""
        self.data['n_char'] = self.data['text'].apply(lambda x: len(x))
        return self.data['n_char'].agg(['min', 'mean', 'max']).to_dict()
    
    def calculate_word_number(self):
        """calculate number of words for each doc. Return min, max, mean values as dict"""
        self.data['n_word'] = self.data['text'].apply(lambda x: len(x.split()))
        return self.data['n_word'].agg(['min', 'mean', 'max']).to_dict()
    
    def count_non_alpha_chars(self):
        non_alpha_chars = Counter()
        total_non_alpha_count = 0
        
        for text in self.data['text']:
            non_alpha = [char for char in text if not char.isalpha() and not char.isspace()]
            non_alpha_chars.update(non_alpha)
            total_non_alpha_count += len(non_alpha)
        
        return {"10_most_common_with_freq": dict(non_alpha_chars.most_common(10)),
                "total_non_alpha_count": total_non_alpha_count}
    
    def get_simple_stats(self):

        n_char_data = self.calculate_char_number()
        n_word_data = self.calculate_word_number()
        non_alpha_data = self.count_non_alpha_chars()

        #TODO ways to get number for ï like letters.
        #TODO: # of links
        #TODO: # of phone numbers
    
    def print_stats(self, pretty=True):
        analyzer_dict = self.__dict__.copy()
        analyzer_dict.pop('data')
        
        res = json.dumps(analyzer_dict, indent=4 if pretty else None)
        print(res)
        
    def to_json(self, output_name):
        analyzer_dict = self.__dict__.copy()
        analyzer_dict.pop('data')
        
        if not output_name.endswith('.json'):
            output_name+='.json'
        
        with open(output_name, 'w', encoding='utf-8') as f:
            json.dump(analyzer_dict, f, indent=4, ensure_ascii=False)
        


fm = FileManager()
data = fm.read_txt('/Users/fatih/Desktop/val.txt')
print(data.head())

                                                text
0  Lorem ipsum dolor sit amet, consectetur adipis...
1  Pellentesque habitant morbi tristique senectus...
2  Quisque suscipit elit nec convallis ullamcorpe...
3  Lorem ipsum dolor sit amet, çonsectetur adipis...
4  Pellentesque habitant morbi tristique senectus...


In [22]:
data['n_words'] = data['text'].apply(lambda x: len(x))
data

Unnamed: 0,text,n_words
0,"Lorem ipsum dolor sit amet, consectetur adipis...",568
1,Pellentesque habitant morbi tristique senectus...,484
2,Quisque suscipit elit nec convallis ullamcorpe...,390
3,"Lorem ipsum dolor sit amet, çonsectetur adipis...",568
4,Pellentesque habitant morbi tristique senectus...,484
5,Quisque suscipit elit nec convallis ullamcorpe...,414
6,"Amet, consectetur adipiscing +1 (555) 123-4567...",610
7,Nam eget orci in ipsum interdum tristique at v...,544


In [71]:
anlyz = Analyzer(data)
# anlyz.data.apply(anlyz.calculate_simple_stats, axis=1)
# anlyz.count_non_ascii_chars()

AttributeError: 'Analyzer' object has no attribute 'count_non_ascii_chars'

In [109]:
anlyz = Analyzer(data)
anlyz.print_stats()

{
    "char_number": {
        "min": 390.0,
        "mean": 507.75,
        "max": 610.0
    },
    "word_number": {
        "min": 58.0,
        "mean": 74.25,
        "max": 91.0
    },
    "non_alpha_chars": {
        "10_most_common_with_freq": {
            ".": 83,
            ",": 38,
            "5": 4,
            "/": 2,
            "1": 2,
            ":": 1,
            "+": 1,
            "(": 1,
            ")": 1,
            "2": 1
        },
        "total_non_alpha_count": 139
    }
}


In [110]:
anlyz.to_json('testing')