#### <div class="alert alert-info"> Program for censoring bad words/phrases in regional-language text </div>


#### Importing the libraries

In [1]:
import pandas as pd
import re 
#import sklearn as sk
#from nltk.corpus import stopwords

#### Reading the data file which is in CSV

In [2]:
# Load the hate-speech lexicon. The file is decoded as latin-1 so that
# accented characters in the CSV are read as single characters.
df = pd.read_csv(
    './CAR-hate-speech.csv',
    encoding='latin-1',
)
# Preview the first rows (rendered as the cell output).
df.head()

Unnamed: 0,Word,Phrase
0,Arabou,A arabo so a ye ti fa que ala la
1,tueur,A bengue so a doit ti kiri na kodro ti ala
2,Touber,Arabo sans bouta
3,Djak,Assassins d'opposants
4,Arna,Bozizé et ses amis Tchadiens veulent encore fa...


#### Data cleaning: 
<ol><li>Make everything lowercase.</li>
    <li>Remove all punctuation that is not required.</li></ol>


In [3]:
# Lower-case both lookup columns so later comparisons can be done against
# a single, normalised spelling of every word and phrase.
for column in ('Word', 'Phrase'):
    df[column] = df[column].str.lower()
df.head()

Unnamed: 0,Word,Phrase
0,arabou,a arabo so a ye ti fa que ala la
1,tueur,a bengue so a doit ti kiri na kodro ti ala
2,touber,arabo sans bouta
3,djak,assassins d'opposants
4,arna,bozizé et ses amis tchadiens veulent encore fa...


#### Masking the bad words : (only words)

In [4]:
def profanityFilter(text, badWords=None):
    """Mask every whitespace-separated token of ``text`` that is a bad word.

    A token is masked only when it is *exactly* equal (case-sensitive) to one
    of the bad words; clean words that merely contain a bad word as a
    substring are left alone, and the original whitespace is preserved.

    Parameters
    ----------
    text : str
        The text to censor.
    badWords : iterable of str, optional
        Words to mask. Defaults to the ``Word`` column of the module-level
        ``df``. Non-string entries (e.g. NaN) and empty strings are ignored.

    Returns
    -------
    str
        ``text`` with each offending token replaced by mask characters of
        the same length.
    """
    badWordMask = '*@#$%@#$%^~@%^~@#$@#$%^~'

    if badWords is None:
        badWords = df.Word.values
    flagged = {w for w in badWords if isinstance(w, str) and w}

    def _maskToken(match):
        token = match.group(0)
        if token not in flagged:
            return token
        # Repeat the mask so tokens longer than the mask string are still
        # covered character-for-character instead of being truncated.
        repeats = len(token) // len(badWordMask) + 1
        return (badWordMask * repeats)[:len(token)]

    # Substituting token-by-token (rather than str.replace on the whole text)
    # keeps the replacement confined to the tokens that actually matched.
    return re.sub(r'\S+', _maskToken, text)


In [5]:
# Interactive demo of the word filter: prompt once, then show the masked text.
for attempt in range(1):
    user_input = input("Enter text: ")
    masked_text = profanityFilter(user_input)
    print(masked_text)

Enter text: djak
*@#$


### Masking the phrases: 

In [6]:
# Summary of the phrase column (count, number of unique values, most frequent entry).
df['Phrase'].describe()

count              66
unique             66
top       les requins
freq                1
Name: Phrase, dtype: object

In [7]:
# Derived column: each phrase with its spaces replaced by underscores.
phrase_column = df['Phrase']
df['PhraseForCompare'] = phrase_column.str.replace(' ', '_')
df.head(1)

Unnamed: 0,Word,Phrase,PhraseForCompare
0,arabou,a arabo so a ye ti fa que ala la,a_arabo_so_a_ye_ti_fa_que_ala_la


### Demonstrating how the phrases should be masked

In [8]:
# Demonstration: mask a known phrase that is embedded in a longer,
# underscore-joined string.
tct = "_assassins_d'opposantsDFGH"
# Kept at module level under this name: a later cell reads `badwordmask`.
badwordmask = '*@#$%@#$%^~@%^~@#$@#$%^~'

# Lexicon entries that contain the phrase of interest; na=False treats
# missing phrases as "no match" instead of propagating NaN into the filter.
candidates = df.PhraseForCompare.loc[
    df.PhraseForCompare.str.contains("assassins_d'opposants", regex=False, na=False)
].values

for txt in candidates:
    # Membership test: is the lexicon phrase present in the text?
    # (str.find returns -1 -- which is truthy -- when nothing is found, so it
    # cannot be used directly as a condition.)
    if txt in tct:
        # Repeat the mask so phrases longer than the mask are fully covered.
        cover = (badwordmask * (len(txt) // len(badwordmask) + 1))[:len(txt)]
        tct = tct.replace(txt, cover)
        print(tct)

_*@#$%@#$%^~@%^~@#$@#$DFGH


## Working script: 
def phraseFiltering(usrtxt, phrases=None):
    """Lower-case ``usrtxt`` and mask every known bad phrase found in it.

    Each phrase is looked up both in its underscore-joined form (as stored in
    ``PhraseForCompare``) and with the underscores turned back into spaces,
    so ordinary space-separated input is matched as well.

    Known limitation (kept on purpose): the text is lower-cased before
    matching, so the returned text does not preserve the caller's casing.

    Parameters
    ----------
    usrtxt : str
        The text to censor.
    phrases : iterable of str, optional
        Phrases to mask. Defaults to the ``PhraseForCompare`` column of the
        module-level ``df``. Non-string entries (e.g. NaN) and empty strings
        are skipped.

    Returns
    -------
    str
        The lower-cased text with every matched phrase replaced by mask
        characters of the same length. When nothing matches, the lower-cased
        text is returned unchanged.
    """
    badwordmask = '*@#$%@#$%^~@%^~@#$@#$%^~'

    if phrases is None:
        phrases = df.PhraseForCompare

    text = usrtxt.lower()

    for phrase in phrases:
        if not isinstance(phrase, str) or not phrase:
            continue
        for candidate in (phrase, phrase.replace('_', ' ')):
            if candidate in text:
                # Repeat the mask so long phrases are covered in full.
                repeats = len(candidate) // len(badwordmask) + 1
                cover = (badwordmask * repeats)[:len(candidate)]
                # Accumulate on `text` so earlier masks are not lost.
                text = text.replace(candidate, cover)

    return text
           

### <div class="alert alert-danger">Problems with the above functions (both Word and Phrase):<br><ol><li>The user text is not always in lower case.</li><br><li>The program should not change the case of the user-supplied text; doing so defeats the purpose, because the output is no longer usable once the bad words are masked.</li><br><li>Words and phrases should both be masked in the same user-supplied text, so these functions should be integrated to work together.</li></ol></div>


### The correct way: Using regex to ignore Case of the user input text

In [9]:
usrtxt = 'Enfant batard laaaaaa c’est leur métier'
# Defined here so the cell does not depend on a variable left over from an
# earlier cell.
badwordmask = '*@#$%@#$%^~@%^~@#$@#$%^~'


def _cover(match):
    """Return mask characters exactly as long as the matched text."""
    size = len(match.group(0))
    return (badwordmask * (size // len(badwordmask) + 1))[:size]


for phrase in df.Phrase:
    # Skip missing values (NaN) and empty strings.
    if not isinstance(phrase, str) or not phrase:
        continue

    # re.escape: lexicon phrases are literal text, not regular expressions.
    pattern = re.compile(re.escape(phrase), re.I)
    found = pattern.search(usrtxt)

    if found is not None:
        print(found.group(0))
        # Case-insensitive replacement that leaves the rest of the text,
        # including its original casing, untouched.
        usrtxt = pattern.sub(_cover, usrtxt)
        print(usrtxt)


Enfant batard
*@#$%@#$%^~@% laaaaaa c’est leur métier


### Integrating both Word and Phrase filtering features on user-entered text: 

In [10]:
def _maskTerms(text, terms, mask):
    """Return ``text`` with every case-insensitive occurrence of any term masked."""
    # Keep only real, non-empty strings (drops NaN and ''); longest first so
    # a long phrase wins over a shorter term that starts at the same position.
    usable = sorted(
        {term for term in terms if isinstance(term, str) and term},
        key=lambda term: (-len(term), term),
    )
    if not usable:
        return text

    # re.escape: lexicon entries are literal text, not regular expressions.
    pattern = re.compile('|'.join(re.escape(term) for term in usable), re.IGNORECASE)

    def _cover(match):
        size = len(match.group(0))
        # Repeat the mask so matches longer than the mask are fully covered.
        return (mask * (size // len(mask) + 1))[:size]

    return pattern.sub(_cover, text)


def ProfanityFiltering(usrtxt=None, words=None, phrases=None):
    """Mask all known bad words and phrases in a text, preserving its casing.

    Matching is case-insensitive and substring-based; only the matched
    characters are replaced, so the rest of the text keeps its original form.

    Parameters
    ----------
    usrtxt : str, optional
        Text to verify. When omitted, the user is prompted for it.
    words : iterable of str, optional
        Bad words. Defaults to the ``Word`` column of the module-level ``df``.
    phrases : iterable of str, optional
        Bad phrases. Defaults to the ``Phrase`` column of the module-level
        ``df``.

    Returns
    -------
    str
        The verified text, which is also printed.
    """
    badWordMask = '*@#$%@#$%^~@%^~@#$@#$%^~'

    if usrtxt is None:
        usrtxt = input('\nPlease enter text: ')
    if words is None:
        words = df.Word
    if phrases is None:
        phrases = df.Phrase

    # Words and phrases are masked together in one pass so that no earlier
    # replacement is overwritten by a later one.
    usrtxt1 = _maskTerms(usrtxt, list(words) + list(phrases), badWordMask)

    print('\n\nThe verified text is as follows: \n ',usrtxt1)
    return usrtxt1


##### Option 0 Function: Exit the program 
def funcExit():
    """Menu option 0: print the exit notice. Takes no arguments, returns None."""
    farewell = "\nExiting..."
    print(farewell)
           

### Taking user input: 

In [12]:
# Menu dispatch table: option string -> handler function.
# Note: the printed menu also lists "Option 2", but no handler is registered
# for '2', so that choice falls through to the "proper choice" message.
dictMenu = {'1': ProfanityFiltering, '0': funcExit}

goodChoice = True

while goodChoice:

    # Separator line followed by the menu.
    print ("_"*90 + "\nOption 1: Verify Text\nOption 2: Upload and verify text")

    # Keep prompting until the answer is numeric.
    usrIn = input("Please give an option from the above menu or press 0 to exit: ")
    while not usrIn.isnumeric():
        print ("\nPlease select the correct menu option")
        usrIn = input("Please give an option from the above menu or press 0 to exit: ")

    # Numeric answer with no registered handler: ask again.
    handler = dictMenu.get(usrIn)
    if handler is None:
        print("\nPlease give proper choice")
        continue

    # Run the selected handler; option 0 additionally ends the menu loop.
    handler()
    if int(usrIn) == 0:
        break


__________________________________________________________________________________________
Option 1: Verify Text
Option 2: Upload and verify text
Please give an option from the above menu or press 0 to exit: 1

Please enter text: Enfant batard laaaaaa c’est leur métier


The verified text is as follows: 
  *@#$%@#$%^~@% laaaaaa c’est leur métier
__________________________________________________________________________________________
Option 1: Verify Text
Option 2: Upload and verify text
Please give an option from the above menu or press 0 to exit: 1

Please enter text: Enfant batard laaaaaa c’est leur métier 2) Qu’on soit clairs tous les supporters de l’OL qui regrettent Genesio et/ou qui veulent son retour vous êtes priés de m’unfollow au plus vite bande d’immenses enfants de putes que vous êtes. Merci


The verified text is as follows: 
  *@#$%@#$%^~@% laaaaaa c’est leur métier 2) Qu’on soit clairs tous les supporters de l’OL qui regrettent Genesio et/ou qui veulent son retour vous

Test twitter:<br>
    1) Enfant batard laaaaaa c’est leur métier<br>
    2) Qu’on soit clairs tous les supporters de l’OL qui regrettent Genesio et/ou qui veulent son retour vous êtes priés de m’unfollow au plus vite bande d’immenses enfants de putes que vous êtes. Merci<br>
    3)<br>
    4)<br>
    5)<br>