In [1]:
import os, sys
%pwd

'/config/workspace/research'

In [2]:
os.chdir("../")
%pwd

'/config/workspace'

In [3]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path

In [4]:
from textClassification.constants import *
from textClassification.utils.common import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
        )

        return data_transformation_config

In [6]:
import pandas as pd
import nltk
import re
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /config/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
        self.stemmer = nltk.SnowballStemmer("english")
        self.stopword = set(stopwords.words('english'))

    
    # Let's apply regex and do cleaning.
    def data_cleaning(self,words):
        words = str(words).lower()
        words = re.sub('\[.*?\]', '', words)
        words = re.sub('https?://\S+|www\.\S+', '', words)
        words = re.sub('<.*?>+', '', words)
        words = re.sub('[%s]' % re.escape(string.punctuation), '', words)
        words = re.sub('\n', '', words)
        words = re.sub('\w*\d\w*', '', words)
        words = [word for word in words.split(' ') if words not in self.stopword]
        words=" ".join(words)
        words = [self.stemmer.stem(words) for word in words.split(' ')]
        words=" ".join(words)

        return words
    

    def clean_and_transform(self):

        data = pd.read_csv(os.path.join(self.config.data_path,"data.csv"))
        
        data["Sentiment"].replace({'neutral':0},inplace=True)
        data["Sentiment"].replace({'positive':1}, inplace = True)
        data["Sentiment"].replace({'negative':2}, inplace = True)
        
        data['Sentence'] = data['Sentence'].apply(self.data_cleaning)

        data.to_csv(os.path.join(self.config.root_dir,'main_df.csv'), index=False)

In [8]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.clean_and_transform()
except Exception as e:
    raise e

[2024-03-17 20:18:29,466: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-03-17 20:18:29,468: INFO: common: yaml file: params.yaml loaded successfully]
[2024-03-17 20:18:29,469: INFO: common: created directory at: artifacts]
[2024-03-17 20:18:29,471: INFO: common: created directory at: artifacts/data_transformation]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Sentiment"].replace({'neutral':0},inplace=True)
  data["Sentiment"].replace({'negative':2}, inplace = True)
