In [1]:
from text_analyzer.file_manager import FileManager
from collections import Counter
from nltk.util import ngrams
import json
import re

class Analyzer:
    """Compute simple descriptive statistics for a collection of documents.

    ``data`` is expected to be a pandas DataFrame with a ``text`` column of
    strings. All statistics are computed once at construction time and stored
    as attributes:

    * ``char_number`` / ``word_number`` -- min / mean / max per document.
    * ``non_alpha_chars`` -- most frequent non-alphabetic characters.
    * ``ngram_dict`` -- most frequent n-grams, keyed by n.

    Every attribute except ``data`` is treated as a statistic by
    ``print_stats`` and ``to_json``.
    """

    def __init__(self, data):
        self.data = self._preprocess_data(data)
        self.char_number = self._calculate_char_number()
        self.word_number = self._calculate_word_number()
        self.non_alpha_chars = self._count_non_alpha_chars()
        self.ngram_dict = self._calculate_most_used_ngrams()

    def _preprocess_data(self, data):
        """Return a copy of ``data`` with two derived text columns.

        * ``preprocessed_text`` -- ``text`` lowercased only.
        * ``processed_text`` -- ``text`` fully cleaned by ``_preprocessing``.

        A copy is made so the caller's DataFrame is not modified.
        """
        data = data.copy()
        data['preprocessed_text'] = data['text'].str.lower()
        data['processed_text'] = data['text'].apply(self._preprocessing)
        return data

    def _preprocessing(self, text):
        """Lowercase ``text``, drop punctuation and digits, normalise whitespace."""
        text = text.lower()

        # `\w` matches "_" too, so the underscore has to be removed explicitly.
        text = re.sub(r'[^\w\s]|_', '', text)
        text = re.sub(r'\d', '', text)
        # `\s+` already covers newlines and runs of spaces.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def _calculate_char_number(self):
        """Store per-document character counts in ``n_char``.

        Returns a dict with the ``min``, ``mean`` and ``max`` of those counts.
        """
        self.data['n_char'] = self.data['text'].str.len()
        return self.data['n_char'].agg(['min', 'mean', 'max']).to_dict()

    def _calculate_word_number(self):
        """Store per-document whitespace-separated word counts in ``n_word``.

        Returns a dict with the ``min``, ``mean`` and ``max`` of those counts.
        """
        self.data['n_word'] = self.data['text'].str.split().str.len()
        return self.data['n_word'].agg(['min', 'mean', 'max']).to_dict()

    def _count_non_alpha_chars(self, n=10):
        """Count characters that are neither alphabetic nor whitespace.

        Digits count as non-alphabetic. Returns a dict with the ``n`` most
        common such characters and their frequencies (key
        ``"<n>_most_common_with_freq"``) and the overall ``"total_count"``.
        """
        non_alpha_chars = Counter()

        for text in self.data['text']:
            non_alpha_chars.update(
                char for char in text if not char.isalpha() and not char.isspace()
            )

        return {f"{n}_most_common_with_freq": dict(non_alpha_chars.most_common(n)),
                "total_count": sum(non_alpha_chars.values())}

    def _calculate_most_used_ngrams(self, n_min=1, n_max=3, first_k=10, use_processed_data=False):
        """Return the ``first_k`` most frequent n-grams for each n in ``n_min..n_max``.

        The result maps n to a dict of ``"word1 word2 ..."`` -> frequency.
        N-grams are counted inside each document and summed over the corpus,
        so no n-gram spans the boundary between two documents.

        By default tokens come from ``preprocessed_text`` (lowercased only), so
        punctuation stays attached to words. Pass ``use_processed_data=True``
        to count over the fully cleaned ``processed_text`` column instead.
        """
        column = 'processed_text' if use_processed_data else 'preprocessed_text'
        tokenized_docs = [doc.split() for doc in self.data[column]]

        ngram_dict = {}
        for n in range(n_min, n_max + 1):
            counts = Counter()
            for tokens in tokenized_docs:
                counts.update(ngrams(tokens, n))
            ngram_dict[n] = {" ".join(phrase): freq
                             for phrase, freq in counts.most_common(first_k)}

        return ngram_dict

    def generate_word_cloud(self, use_processed_data=True, save=False, output_name='word_cloud.png'):
        """Draw one word cloud built from all documents.

        Uses ``processed_text`` when ``use_processed_data`` is true, otherwise
        the raw ``text`` column. When ``save`` is true the image is also
        written to ``output_name``.
        """
        # Imported here so the rest of the class works without these packages.
        from wordcloud import WordCloud
        import matplotlib.pyplot as plt

        column = 'processed_text' if use_processed_data else 'text'
        corpus = " ".join(self.data[column].tolist())

        word_cloud = WordCloud(width=800, height=800, background_color='white',
                               min_font_size=10).generate(corpus)

        _, ax = plt.subplots()
        ax.imshow(word_cloud, interpolation='bilinear')
        ax.axis('off')

        if save:
            word_cloud.to_file(output_name)

        plt.show()

    def _get_simple_stats(self):
        """Recompute the character, word and non-alpha statistics.

        Returns them as a dict keyed like the matching instance attributes.
        """
        #TODO ways to get number for ï like letters.
        #TODO: # of links
        #TODO: # of phone numbers
        return {
            'char_number': self._calculate_char_number(),
            'word_number': self._calculate_word_number(),
            'non_alpha_chars': self._count_non_alpha_chars(),
        }

    def _stats_dict(self):
        """Return every instance attribute except the ``data`` DataFrame."""
        return {key: value for key, value in self.__dict__.items() if key != 'data'}

    def print_stats(self, pretty=True):
        """Print the statistics as JSON, indented when ``pretty`` is true."""
        # ensure_ascii=False keeps non-ASCII characters readable, as in to_json.
        res = json.dumps(self._stats_dict(), indent=4 if pretty else None, ensure_ascii=False)
        print(res)

    def to_json(self, output_name):
        """Write the statistics to ``output_name`` as UTF-8 JSON.

        ``.json`` is appended when ``output_name`` does not already end with it.
        """
        if not output_name.endswith('.json'):
            output_name += '.json'

        with open(output_name, 'w', encoding='utf-8') as f:
            json.dump(self._stats_dict(), f, indent=4, ensure_ascii=False)

# fm = FileManager()
# data = fm.read_txt('/Users/fatih/Desktop/val.txt')
# print(data.iloc[-1, :])

# analyzer = Analyzer(data)
# analyzer.print_stats()
# analyzer.generate_word_cloud(save=True, output_name='word_cloud.png')

                                                text
0  Lorem ipsum dolor sit amet, consectetur adipis...
1  Pellentesque habitant morbi tristique senectus...
2  Quisque suscipit elit nec convallis ullamcorpe...
3  Lorem ipsum dolor sit amet, çonsectetur adipis...
4  Pellentesque habitant morbi tristique senectus...


In [2]:
# Build the analyzer; all statistics are computed inside the constructor.
# NOTE(review): `data` is not defined by any active cell shown here (the FileManager
# loading lines in the first cell are commented out) — confirm it is loaded before this runs.
anlyz = Analyzer(data)
# anlyz.data.apply(anlyz.calculate_simple_stats, axis=1)
# anlyz.count_non_ascii_chars()

In [3]:
# Draw one word cloud from the cleaned `processed_text` column (the default).
# Needs the optional `wordcloud` package, which is imported inside the method.
anlyz.generate_word_cloud()

ModuleNotFoundError: No module named 'wordcloud'

In [42]:
# Print every computed statistic (all attributes except the DataFrame) as indented JSON.
anlyz.print_stats()

{
    "char_number": {
        "min": 390.0,
        "mean": 507.75,
        "max": 610.0
    },
    "word_number": {
        "min": 58.0,
        "mean": 74.25,
        "max": 91.0
    },
    "non_alpha_chars": {
        "10_most_common_with_freq": {
            ".": 83,
            ",": 38,
            "5": 4,
            "/": 2,
            "1": 2,
            ":": 1,
            "+": 1,
            "(": 1,
            ")": 1,
            "2": 1
        },
        "total_non_alpha_count": 139
    },
    "ngram_dict": {
        "1": {
            "sed": 15,
            "nec": 15,
            "in": 15,
            "vitae": 13,
            "ut": 12,
            "ac": 12,
            "at": 11,
            "eu": 11,
            "sapien": 10,
            "arcu": 10
        },
        "2": {
            "nulla facilisi.": 6,
            "ut vestibulum": 4,
            "nec sapien": 4,
            "lorem ipsum": 3,
            "ipsum dolor": 3,
            "dolor sit": 3,
            "nec t

In [110]:
# Save the statistics to disk; `to_json` appends ".json", so this writes `testing.json`.
anlyz.to_json('testing')

In [16]:
from nltk.util import ngrams
from collections import Counter

# Toy corpus: join a few sentences into one string, tokenise on whitespace,
# then count bigrams and keep the five most frequent.
testing = " ".join([
    'hello this is a testing sentence',
    'this sentence is not related with anything',
    'this sentence is created for test purposes',
    'this is another list of sentences',
    'another sentence bla bla',
])
testing_splitted = testing.split()
ngram = Counter(ngrams(testing_splitted, 2)).most_common(5)
print(ngram)

[(('this', 'is'), 2), (('this', 'sentence'), 2), (('sentence', 'is'), 2), (('hello', 'this'), 1), (('is', 'a'), 1)]


In [23]:
# Show each (word tuple, count) pair as "<words joined by spaces> <count>".
for tokens, count in ngram:
    print(f"{' '.join(tokens)} {count}")

this is 2
this sentence 2
sentence is 2
hello this 1
is a 1


In [22]:
# Converting the (word tuple, count) pairs directly keeps the tuples as dict keys.
dict(ngram)

{('this', 'is'): 2,
 ('this', 'sentence'): 2,
 ('sentence', 'is'): 2,
 ('hello', 'this'): 1,
 ('is', 'a'): 1}

In [41]:
# Top n-grams per n (1 to 3 by default). They are built from the lowercased-only
# text, so punctuation stays attached to tokens (e.g. "amet,").
anlyz.ngram_dict

{1: {'sed': 15,
  'nec': 15,
  'in': 15,
  'vitae': 13,
  'ut': 12,
  'ac': 12,
  'at': 11,
  'eu': 11,
  'sapien': 10,
  'arcu': 10},
 2: {'nulla facilisi.': 6,
  'ut vestibulum': 4,
  'nec sapien': 4,
  'lorem ipsum': 3,
  'ipsum dolor': 3,
  'dolor sit': 3,
  'nec turpis': 3,
  'ac fringilla': 3,
  'vivamus consequat': 3,
  'ac ultricies': 3},
 3: {'lorem ipsum dolor': 3,
  'ipsum dolor sit': 3,
  'dolor sit amet,': 2,
  'amet, consectetur adipiscing': 2,
  'ligula. fusce tincidunt': 2,
  'fusce tincidunt odio': 2,
  'tincidunt odio vitae': 2,
  'odio vitae nisl': 2,
  'vitae nisl commodo': 2,
  'nisl commodo hendrerit.': 2}}