In [1]:
#import necessary libraries
import re
import pandas as pd
import sqlite3

from flask import Flask, jsonify, request
from flasgger import Swagger, swag_from, LazyString, LazyJSONEncoder 
from flasgger import swag_from

In [2]:
#flask application and swagger (flasgger) setup
app = Flask(__name__)
#use flasgger's LazyJSONEncoder as the JSON encoder of the app
app.json_encoder = LazyJSONEncoder

#swagger template: every value is wrapped in LazyString, so the lambdas
#(including the one reading request.host) only run when the value is evaluated
swagger_template = {
    'info': {
        'title': LazyString(lambda: 'API Documentation for Data Cleansing'),
        'version': LazyString(lambda: '1.0.0'),
        'description': LazyString(lambda: 'API Documentation for Data Cleansing'),
    },
    'host': LazyString(lambda: request.host),
}

#swagger UI configuration: spec served at /docs.json, UI served at /docs/
swagger_config = dict(
    headers=[],
    specs=[
        dict(endpoint='docs', route='/docs.json'),
    ],
    static_url_path="/flasgger_static",
    swagger_ui=True,
    specs_route="/docs/",
)

swagger = Swagger(app, template=swagger_template, config=swagger_config)

In [None]:
#root endpoint (GET): static welcome payload
@swag_from("/Users/feybearsella_m/Documents/Binar Challenge Wave 15/hello_world.yml", methods=['GET'])
@app.route('/', methods=['GET'])
def hello_world():
    """Return the welcome message of the API as a JSON response.

    The payload carries a ``status_code`` of 200 and the same welcome
    sentence under both ``description`` and ``data``.
    """
    return jsonify({
        'status_code': 200,
        'description': "Welcome to API data cleansing",
        'data': "Welcome to API data cleansing",
    })

#helper for the text-processing endpoint: regex-based normalisation of one text
def _normalize_text(text):
    """Lowercase ``text`` and strip tweet noise, returning single-spaced text.

    Markers that are recognised through a punctuation character (escaped
    bytes, URLs, mentions, hashtags) are removed BEFORE the generic
    punctuation strip; otherwise the backslash, '@' and '#' they rely on
    would already be gone and those steps could never match.

    Args:
        text: raw input string.

    Returns:
        The cleaned, lowercase string with words separated by one space.
    """
    text = text.lower()
    #literal escaped newlines (backslash followed by "n") become a space
    text = re.sub(r'\\n', ' ', text)
    #literal escaped bytes such as "\xf0\x9f"; the backslash is required so
    #ordinary words containing the letter x are left intact
    text = re.sub(r'\\x[0-9a-f]{2}', ' ', text)
    #remove URL
    text = re.sub(r'http\S+', ' ', text)
    #remove mention
    text = re.sub(r'@\S+', ' ', text)
    #remove hashtag
    text = re.sub(r'#\S+', ' ', text)
    #remove punctuation (anything that is not a letter, digit or whitespace)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    #remove numeric number
    text = re.sub(r'\d+', '', text)
    #remove the retweet marker and the word "user", as whole words only;
    #the text is already lowercase here, so the marker is matched as "rt"
    text = re.sub(r'\b(?:rt|user)\b', ' ', text)
    #collapse every run of whitespace (including real newlines) to one space
    return ' '.join(text.split())


#define endpoints: post method for text processing
@swag_from("/Users/feybearsella_m/Documents/Binar Challenge Wave 15/text_processing.yml", methods=['POST'])
@app.route('/text-processing', methods=['POST'])
def text_processing():
    """Clean the ``text`` form field and return it as JSON.

    The text is normalised with ``_normalize_text``; afterwards every word
    listed in ``abusive.csv`` is dropped and every word that has an entry in
    ``new_kamusalay.csv`` is replaced by its mapped value.

    Returns:
        ``{'cleaned_text': <str>}`` on success, or a JSON error body with
        HTTP status 400 when the ``text`` form field is missing.
    """
    text = request.form.get('text')
    if text is None:
        #previously this fell through to None.lower() and raised AttributeError
        error_body = {
            'status_code': 400,
            'description': "Form field 'text' is required.",
        }
        return jsonify(error_body), 400

    #to read abusive.csv: one word per line
    with open('abusive.csv', 'r', encoding='latin-1') as f:
        abusive_words = {line.strip() for line in f}

    #to read new_kamusalay: lines of the form "<word>:<replacement>"
    #NOTE(review): file_processing reads this same file with pandas' default
    #comma delimiter - confirm which separator the file really uses
    alay_dict = {}
    with open('new_kamusalay.csv', 'r', encoding='latin-1') as f:
        for line in f:
            parts = line.strip().split(':')
            #skip blank lines and lines that are not exactly "key:value"
            if len(parts) == 2:
                alay_dict[parts[0]] = parts[1]

    #drop abusive words, then replace the words found in the alay dictionary
    new_words = [
        alay_dict.get(word, word)
        for word in _normalize_text(text).split()
        if word not in abusive_words
    ]

    return jsonify({'cleaned_text': ' '.join(new_words)})


#define endpoints: post method for file processing 
@swag_from("/Users/feybearsella_m/Documents/Binar Challenge Wave 15/file_processing.yml", methods=['POST'])
@app.route('/file-processing', methods=['POST'])
def file_processing():
    """Clean the ``Tweet`` column of an uploaded CSV and persist the result.

    Steps:
        1. Read the uploaded file (form field ``file``) into a DataFrame and
           keep the de-duplicated, non-null ``Tweet`` column.
        2. Add ``no_char`` / ``no_words`` for the raw tweet.
        3. Add ``cleaned_tweet`` (see ``tweet_cleansing``) together with
           ``no_char_2`` / ``no_words_2`` computed on it.
        4. Count the abusive words (``estimated_no_abs_words``) and then
           remove them, plus repeated words, from ``cleaned_tweet``.
        5. Write ``cleaned_data.csv`` and, only when the SQLite table ``df``
           is still empty, insert the rows into it.

    Returns:
        A JSON body with ``status_code`` 200 on success, or a JSON error body
        with HTTP status 400 when the upload has no ``Tweet`` column.
    """
    #abusive word list: first column of abusive.csv, lowercased for matching.
    #The file content is used here instead of a hard-coded placeholder list.
    df_abusive = pd.read_csv('abusive.csv', encoding='latin-1', header=None)
    abusive_words = set(
        df_abusive.iloc[:, 0].dropna().astype(str).str.strip().str.lower()
    )

    #get file from upload to dataframe
    file = request.files['file']
    print(f"Received file: {file.filename}")

    #import file object to pandas dataframe
    df = pd.read_csv(file, encoding='latin-1')
    print("DataFrame loaded successfully.")

    if 'Tweet' not in df.columns:
        error_body = {
            'status_code': 400,
            'description': "Uploaded CSV must contain a 'Tweet' column.",
        }
        return jsonify(error_body), 400

    #keep only the tweet column; drop missing values (len() fails on NaN) and
    #duplicates without inplace, so the columns below go onto a fresh frame
    df = df[['Tweet']].dropna().drop_duplicates()
    df['Tweet'] = df['Tweet'].astype(str)

    #number of characters and words of the raw tweet
    df['no_char'] = df['Tweet'].apply(len)
    df['no_words'] = df['Tweet'].apply(lambda x: len(x.split()))

    #function to clean data
    def tweet_cleansing(tweet):
        """Return ``tweet`` lowercased and stripped of noise.

        Every step works on the result of the previous one. Escaped bytes,
        hashtags and mentions are removed first because they are recognised
        by characters that the non-alphabetic strip would delete.
        """
        #literal escaped bytes such as "\xf0\x9f" and literal "\n"
        cleaned = re.sub(r'\\x[0-9a-fA-F]{2}', ' ', tweet)
        cleaned = re.sub(r'\\n', ' ', cleaned)
        #hashtags and user mentions (e.g., @username)
        cleaned = re.sub(r'#\S+', ' ', cleaned)
        cleaned = re.sub(r'@\w+', ' ', cleaned)
        #non-alphabetic characters (excluding whitespace), then lowercase
        cleaned = re.sub(r'[^a-zA-Z\s]', '', cleaned).lower()
        #words with three or fewer characters, and with 15 or more characters
        cleaned = re.sub(r'\b\w{1,3}\b', '', cleaned)
        cleaned = re.sub(r'\b\w{15,}\b', '', cleaned)
        #collapse whitespace and trim both ends
        return re.sub(r'\s+', ' ', cleaned).strip()

    #function to count number of abusive words
    def count_abusive(cleaned_tweet, abusive_words):
        """Count the words of ``cleaned_tweet`` that are in ``abusive_words``."""
        return sum(1 for word in cleaned_tweet.split() if word.lower() in abusive_words)

    #function to remove abusive words
    def remove_abusive(cleaned_tweet, abusive_words):
        """Drop abusive words and repeated words, keeping first occurrences.

        Matching is done on whole words, so a word that merely contains an
        abusive word as a substring is left intact.
        """
        seen = set()
        kept = []
        for word in cleaned_tweet.lower().split():
            if word in abusive_words or word in seen:
                continue
            seen.add(word)
            kept.append(word)
        return ' '.join(kept)

    #create new cleaned_tweet column
    df['cleaned_tweet'] = df['Tweet'].apply(tweet_cleansing)

    #create new no_word and no_char on cleaned_tweet column
    #NOTE(review): as before, these are measured before the abusive words are
    #removed - confirm whether they should describe the final cleaned_tweet
    df['no_char_2'] = df['cleaned_tweet'].apply(len)
    df['no_words_2'] = df['cleaned_tweet'].apply(lambda x: len(x.split()))

    #count abusive words BEFORE removing them; counting afterwards finds none
    abusive_counts = df['cleaned_tweet'].apply(lambda x: count_abusive(x, abusive_words))

    #remove abusive words
    df['cleaned_tweet'] = df['cleaned_tweet'].apply(lambda x: remove_abusive(x, abusive_words))
    df['estimated_no_abs_words'] = abusive_counts

    #to save the cleaned data to csv file
    df.to_csv('cleaned_data.csv', index=False)

    #connect and create new database
    database_path = '/Users/feybearsella_m/Documents/Binar Challenge Wave 15/data.db'
    q_create_table = """
        create table if not exists df (Tweet varchar(255), no_char int, no_words int, cleaned_tweet varchar(255), no_char_2 int, no_words_2 int);
        """
    q_insertion = "insert into df (Tweet, no_char, no_words, cleaned_tweet, no_char_2, no_words_2) values (?,?,?,?,?,?)"
    db_columns = ['Tweet', 'no_char', 'no_words', 'cleaned_tweet', 'no_char_2', 'no_words_2']

    conn = sqlite3.connect(database_path)
    try:
        #"with conn" commits the transaction on success and rolls back on error
        with conn:
            conn.execute(q_create_table)

            #check table has data or not
            num_rows = conn.execute("select count(*) from df").fetchone()[0]

            #insert the data only if table has no data
            #NOTE(review): when the table already has rows nothing is inserted,
            #yet the response below still reports the data as saved
            if num_rows == 0:
                rows = [
                    #int() converts the numpy integers to plain Python ints
                    (tweet, int(no_char), int(no_words), cleaned_tweet, int(no_char_2), int(no_words_2))
                    for tweet, no_char, no_words, cleaned_tweet, no_char_2, no_words_2
                    in df[db_columns].itertuples(index=False, name=None)
                ]
                conn.executemany(q_insertion, rows)
    finally:
        #close the connection even when one of the statements above raised
        conn.close()

    json_response = {
        'status_code' : 200,
        'description' : "File has been cleaned and saved in database and csv file.",
        'data' : "cleaned_data",
    }

    response_data = jsonify(json_response)
    return response_data
                     
#start the Flask server with its default settings, only when this code runs
#as the main module (not when it is imported)
if __name__ == "__main__":
    app.run()

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [05/Dec/2023 00:10:23] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [05/Dec/2023 00:10:26] "GET /docs/ HTTP/1.1" 200 -
127.0.0.1 - - [05/Dec/2023 00:10:26] "GET /flasgger_static/swagger-ui-bundle.js HTTP/1.1" 304 -
127.0.0.1 - - [05/Dec/2023 00:10:26] "GET /flasgger_static/swagger-ui.css HTTP/1.1" 304 -
127.0.0.1 - - [05/Dec/2023 00:10:26] "GET /flasgger_static/swagger-ui-standalone-preset.js HTTP/1.1" 304 -
127.0.0.1 - - [05/Dec/2023 00:10:26] "GET /flasgger_static/lib/jquery.min.js HTTP/1.1" 304 -
127.0.0.1 - - [05/Dec/2023 00:10:26] "GET /docs.json HTTP/1.1" 200 -


Received file: data.csv
DataFrame loaded successfully.


127.0.0.1 - - [05/Dec/2023 00:10:37] "POST /file-processing HTTP/1.1" 200 -
