In [16]:
import numpy as np
import pandas as pd
from scipy import stats

In [17]:
import unittest
from pandas import testing as tm

In [18]:
csv_data = pd.DataFrame({'col 1': range(1, 7, 2)})
csv_data

Unnamed: 0,col 1
0,1
1,3
2,5


In [19]:
class Standardize:
    def __init__(self, df_data):
        self.data = df_data
    
    def _fit(self):
        self.data['std'] = stats.zscore(self.data['col 1'])
        return self
    
    def _transform(self, col_name, to_add):
        self.data[col_name] = self.data[col_name].transform(lambda x: x + to_add)
        return self
    
    def _inverse_transform(self, col_name, to_subtract):
        self.data[col_name] = self.data[col_name].transform(lambda x: x - to_subtract) 
        return self

In [20]:
x = Standardize(csv_data)._fit()

In [21]:
x._transform('col 1', 2)
x.data

Unnamed: 0,col 1,std
0,3,-1.224745
1,5,0.0
2,7,1.224745


In [22]:
x._inverse_transform('col 1', 4)
x.data

Unnamed: 0,col 1,std
0,-1,-1.224745
1,1,0.0
2,3,1.224745


In [23]:
x.data['std']

0   -1.224745
1    0.000000
2    1.224745
Name: std, dtype: float64

In [24]:
import unittest

class TestStringMethods(unittest.TestCase):

    def test_fit(self):
        left =  Standardize(pd.DataFrame({'col 1': range(1, 7, 2)}))._fit().data['std'].to_list()
        print(left)
        self.assertEqual(
            left, 
            [-1.224744871391589, 0.0, 1.224744871391589])
    
    def test_transform(self):
        to_transform = 2
        col = 'col 1'
        right = Standardize(pd.DataFrame({col: range(1, 7, 2)})).data[col].to_list()
        left = Standardize(pd.DataFrame({col: range(1, 7, 2)}))._transform(col, to_transform)
        left = left._inverse_transform(col, to_transform).data[col].to_list()
        
        self.assertEqual(left, right)
        
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

..

[-1.224744871391589, 0.0, 1.224744871391589]



----------------------------------------------------------------------
Ran 2 tests in 0.004s

OK


In [25]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from typing import List

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gabi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gabi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gabi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [26]:
sentence_data = "The First sentence is about Python. The Second: about Django. You can learn Python,Django and Data Ananlysis here. "

In [27]:
def tokenize_data(data):
    return nltk.sent_tokenize(data)

def rmv_punct(data):
    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    for p in punc:
        if p in data:
            data = data.replace(p, "")
    
    return data

def rmv_stop(data):
    stop_words = set(stopwords.words('english'))
 
    word_tokens = word_tokenize(data)

    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]

    filtered_sentence = []

    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
            
    return filtered_sentence

def lemmatize(data):
    lemmatizer = WordNetLemmatizer()
    
    return lemmatizer.lemmatize(data)


print(tokenize_data(sentence_data))
print(rmv_punct(sentence_data))
print(rmv_stop(sentence_data))
print(lemmatize(sentence_data))

['The First sentence is about Python.', 'The Second: about Django.', 'You can learn Python,Django and Data Ananlysis here.']
The First sentence is about Python The Second about Django You can learn PythonDjango and Data Ananlysis here 
['The', 'First', 'sentence', 'Python', '.', 'The', 'Second', ':', 'Django', '.', 'You', 'learn', 'Python', ',', 'Django', 'Data', 'Ananlysis', '.']
The First sentence is about Python. The Second: about Django. You can learn Python,Django and Data Ananlysis here. 


In [28]:
def multiple_tokenize(data: List, func_to_apply):
    for i, v in enumerate(data):
        data[i] = func_to_apply(data[i])
    
    return data

In [29]:
list_of_data = [sentence_data for i in range(3)]
multiple_tokenize(list_of_data, rmv_stop)

[['The',
  'First',
  'sentence',
  'Python',
  '.',
  'The',
  'Second',
  ':',
  'Django',
  '.',
  'You',
  'learn',
  'Python',
  ',',
  'Django',
  'Data',
  'Ananlysis',
  '.'],
 ['The',
  'First',
  'sentence',
  'Python',
  '.',
  'The',
  'Second',
  ':',
  'Django',
  '.',
  'You',
  'learn',
  'Python',
  ',',
  'Django',
  'Data',
  'Ananlysis',
  '.'],
 ['The',
  'First',
  'sentence',
  'Python',
  '.',
  'The',
  'Second',
  ':',
  'Django',
  '.',
  'You',
  'learn',
  'Python',
  ',',
  'Django',
  'Data',
  'Ananlysis',
  '.']]

In [30]:
import unittest

class TestStringMethods(unittest.TestCase):

    def test_multiple_tokenize(self, func=rmv_stop):
        right = [['The',
  'First',
  'sentence',
  'Python',
  '.',
  'The',
  'Second',
  ':',
  'Django',
  '.',
  'You',
  'learn',
  'Python',
  ',',
  'Django',
  'Data',
  'Ananlysis',
  '.'],
 ['The',
  'First',
  'sentence',
  'Python',
  '.',
  'The',
  'Second',
  ':',
  'Django',
  '.',
  'You',
  'learn',
  'Python',
  ',',
  'Django',
  'Data',
  'Ananlysis',
  '.'],
 ['The',
  'First',
  'sentence',
  'Python',
  '.',
  'The',
  'Second',
  ':',
  'Django',
  '.',
  'You',
  'learn',
  'Python',
  ',',
  'Django',
  'Data',
  'Ananlysis',
  '.']]
        left = ['The First sentence is about Python. The Second: about Django. You can learn Python,Django and Data Ananlysis here. ',
 'The First sentence is about Python. The Second: about Django. You can learn Python,Django and Data Ananlysis here. ',
 'The First sentence is about Python. The Second: about Django. You can learn Python,Django and Data Ananlysis here. ']
        
        self.assertEqual(multiple_tokenize(left, func), right)
        
        
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'])

.
----------------------------------------------------------------------
Ran 1 test in 0.003s

OK


SystemExit: False

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
